author    Dimitry Andric <dim@FreeBSD.org>    2020-07-31 21:22:58 +0000
committer Dimitry Andric <dim@FreeBSD.org>    2020-07-31 21:22:58 +0000
commit    5ffd83dbcc34f10e07f6d3e968ae6365869615f4 (patch)
tree      0e9f5cf729dde39f949698fddef45a34e2bc7f44 /contrib/llvm-project/llvm/lib/Target/ARM
parent    1799696096df87b52968b8996d00c91e0a5de8d9 (diff)
parent    cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff)
Merge llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp
master 2e10b7a39b9, the last commit before the llvmorg-12-init tag, from which release/11.x was branched. Note that for now, I rolled back all our local changes to make merging easier, and I will reapply the still-relevant ones after updating to 11.0.0-rc1.
Notes: svn path=/projects/clang1100-import/; revision=363742
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/ARM')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARM.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARM.td | 73
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp | 267
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 662
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 190
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 95
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h | 29
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp | 57
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp | 65
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td | 48
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp | 72
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 858
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp | 94
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp | 239
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h | 41
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 532
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp | 2359
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h | 87
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td | 666
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td | 146
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td | 2090
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td | 486
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td | 92
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td | 266
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp | 32
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 311
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 1116
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp | 32
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td | 130
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp | 60
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp | 20
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 532
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 67
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 442
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 59
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 85
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 50
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 147
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp | 13
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h | 25
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 42
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp | 911
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp | 592
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp | 286
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp | 464
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp | 54
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h | 11
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 58
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h | 66
88 files changed, 12102 insertions(+), 3442 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
index 3412813a3ef2..7398968bb24a 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h
@@ -47,6 +47,7 @@ FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
FunctionPass *createMVEVPTBlockPass();
+FunctionPass *createMVEVPTOptimisationsPass();
FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor = nullptr);
@@ -66,6 +67,7 @@ void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
+void initializeMVEVPTOptimisationsPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
void initializeMVETailPredicationPass(PassRegistry &);
void initializeMVEGatherScatterLoweringPass(PassRegistry &);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
index 380eaa863689..0468f7f1cf8e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td
@@ -424,6 +424,13 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
def FeatureSB : SubtargetFeature<"sb", "HasSB", "true",
"Enable v8.5a Speculation Barrier" >;
+// Armv8.6-A extensions
+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true",
+ "Enable support for BFloat16 instructions", [FeatureNEON]>;
+
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+ "true", "Enable Matrix Multiply Int8 Extension", [FeatureNEON]>;
+
// Armv8.1-M extensions
def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true",
@@ -523,6 +530,11 @@ def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true",
"Support ARM v8.5a instructions",
[HasV8_4aOps, FeatureSB]>;
+def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true",
+ "Support ARM v8.6a instructions",
+ [HasV8_5aOps, FeatureBF16,
+ FeatureMatMulInt8]>;
+
def HasV8_1MMainlineOps : SubtargetFeature<
"v8.1m.main", "HasV8_1MMainlineOps", "true",
"Support ARM v8-1M Mainline instructions",
@@ -536,6 +548,16 @@ def HasMVEFloatOps : SubtargetFeature<
"Support M-Class Vector Extension with integer and floating ops",
[HasMVEIntegerOps, FeatureFPARMv8_D16_SP, FeatureFullFP16]>;
+def HasCDEOps : SubtargetFeature<"cde", "HasCDEOps", "true",
+ "Support CDE instructions",
+ [HasV8MMainlineOps]>;
+
+foreach i = {0-7} in
+ def FeatureCoprocCDE#i : SubtargetFeature<"cdecp"#i,
+ "CoprocCDE["#i#"]", "true",
+ "Coprocessor "#i#" ISA is CDEv1",
+ [HasCDEOps]>;
+
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -572,6 +594,12 @@ def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", []>;
def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", []>;
+def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
+ "Cortex-A77 ARM processors", []>;
+def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78",
+ "Cortex-A78 ARM processors", []>;
+def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
+ "Cortex-X1 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
"Qualcomm Krait processors", []>;
@@ -787,6 +815,19 @@ def ARMv85a : Architecture<"armv8.5-a", "ARMv85a", [HasV8_5aOps,
FeatureCRC,
FeatureRAS,
FeatureDotProd]>;
+def ARMv86a : Architecture<"armv8.6-a", "ARMv86a", [HasV8_6aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
@@ -1114,6 +1155,14 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureUseMISched,
FeatureHasNoBranchPredictor]>;
+def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline,
+ FeatureDSP,
+ FeatureFPARMv8_D16,
+ FeatureUseMISched,
+ FeatureHasNoBranchPredictor,
+ FeaturePrefLoopAlign32,
+ FeatureHasSlowFPVMLx,
+ HasMVEFloatOps]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
@@ -1181,6 +1230,30 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"cortex-a77", [ARMv82a, ProcA77,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def : ProcNoItin<"cortex-a78", [ARMv82a, ProcA78,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
def : ProcNoItin<"neoverse-n1", [ARMv82a,
FeatureHWDivThumb,
FeatureHWDivARM,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 6f26ca127f94..d6c1efa6327c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -57,26 +57,36 @@ ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
: AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), AFI(nullptr),
MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {}
-void ARMAsmPrinter::EmitFunctionBodyEnd() {
+void ARMAsmPrinter::emitFunctionBodyEnd() {
// Make sure to terminate any constant pools that were at the end
// of the function.
if (!InConstantPool)
return;
InConstantPool = false;
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
}
-void ARMAsmPrinter::EmitFunctionEntryLabel() {
+void ARMAsmPrinter::emitFunctionEntryLabel() {
if (AFI->isThumbFunction()) {
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
- OutStreamer->EmitThumbFunc(CurrentFnSym);
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
+ OutStreamer->emitThumbFunc(CurrentFnSym);
} else {
- OutStreamer->EmitAssemblerFlag(MCAF_Code32);
+ OutStreamer->emitAssemblerFlag(MCAF_Code32);
}
- OutStreamer->EmitLabel(CurrentFnSym);
+
+ // Emit symbol for CMSE non-secure entry point
+ if (AFI->isCmseNSEntryFunction()) {
+ MCSymbol *S =
+ OutContext.getOrCreateSymbol("__acle_se_" + CurrentFnSym->getName());
+ emitLinkage(&MF->getFunction(), S);
+ OutStreamer->emitSymbolAttribute(S, MCSA_ELF_TypeFunction);
+ OutStreamer->emitLabel(S);
+ }
+
+ OutStreamer->emitLabel(CurrentFnSym);
}
-void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
+void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType());
assert(Size && "C++ constructor pointer had zero size!");
@@ -90,17 +100,17 @@ void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
: MCSymbolRefExpr::VK_None),
OutContext);
- OutStreamer->EmitValue(E, Size);
+ OutStreamer->emitValue(E, Size);
}
-void ARMAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void ARMAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (PromotedGlobals.count(GV))
// The global was promoted into a constant pool. It should not be emitted.
return;
- AsmPrinter::EmitGlobalVariable(GV);
+ AsmPrinter::emitGlobalVariable(GV);
}
-/// runOnMachineFunction - This uses the EmitInstruction()
+/// runOnMachineFunction - This uses the emitInstruction()
/// method to print assembly for each instruction.
///
bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -158,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
// Emit the rest of the function body.
- EmitFunctionBody();
+ emitFunctionBody();
// Emit the XRay table for this function.
emitXRayTable();
@@ -167,10 +177,10 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// These are created per function, rather than per TU, since it's
// relatively easy to exceed the thumb branch range within a TU.
if (! ThumbIndirectPads.empty()) {
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
- EmitAlignment(Align(2));
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
+ emitAlignment(Align(2));
for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) {
- OutStreamer->EmitLabel(TIP.second);
+ OutStreamer->emitLabel(TIP.second);
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
.addReg(TIP.first)
// Add predicate operands.
@@ -467,14 +477,14 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
// the start mode, then restore the start mode.
const bool WasThumb = isThumb(StartInfo);
if (!EndInfo || WasThumb != isThumb(*EndInfo)) {
- OutStreamer->EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
+ OutStreamer->emitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
}
}
-void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+void ARMAsmPrinter::emitStartOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
// Use unified assembler syntax.
- OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified);
+ OutStreamer->emitAssemblerFlag(MCAF_SyntaxUnified);
// Emit ARM Build Attributes
if (TT.isOSBinFormatELF())
@@ -484,20 +494,20 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
// if we're thumb for the purposes of the top level code16 assembler
// flag.
if (!M.getModuleInlineAsm().empty() && TT.isThumb())
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
}
static void
emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
MachineModuleInfoImpl::StubValueTy &MCSym) {
// L_foo$stub:
- OutStreamer.EmitLabel(StubLabel);
+ OutStreamer.emitLabel(StubLabel);
// .indirect_symbol _foo
- OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+ OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
if (MCSym.getInt())
// External to current translation unit.
- OutStreamer.EmitIntValue(0, 4/*size*/);
+ OutStreamer.emitIntValue(0, 4/*size*/);
else
// Internal to current translation unit.
//
@@ -505,13 +515,13 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
// pointers need to be indirect and pc-rel. We accomplish this by
// using NLPs; however, sometimes the types are local to the file.
// We need to fill in the value for the NLP in those cases.
- OutStreamer.EmitValue(
+ OutStreamer.emitValue(
MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
4 /*size*/);
}
-void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void ARMAsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
// All darwin targets use mach-o.
@@ -526,7 +536,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -539,7 +549,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -553,7 +563,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
// implementation of multiple entry points). If this doesn't occur, the
// linker can safely perform dead code stripping. Since LLVM never
// generates code that does this, it is always safe to set.
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
// The last attribute to be emitted is ABI_optimization_goals
@@ -570,18 +580,28 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
}
//===----------------------------------------------------------------------===//
-// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// Helper routines for emitStartOfAsmFile() and emitEndOfAsmFile()
// FIXME:
// The following seem like one-off assembler flags, but they actually need
// to appear in the .ARM.attributes section in ELF.
// Instead of subclassing the MCELFStreamer, we do the work here.
-// Returns true if all functions have the same function attribute value.
-// It also returns true when the module has no functions.
+ // Returns true if all functions have the same function attribute value.
+ // It also returns true when the module has no functions.
static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr,
StringRef Value) {
+ return !any_of(M, [&](const Function &F) {
+ return F.getFnAttribute(Attr).getValueAsString() != Value;
+ });
+}
+// Returns true if all functions have the same denormal mode.
+// It also returns true when the module has no functions.
+static bool checkDenormalAttributeConsistency(const Module &M,
+ StringRef Attr,
+ DenormalMode Value) {
return !any_of(M, [&](const Function &F) {
- return F.getFnAttribute(Attr).getValueAsString() != Value;
+ StringRef AttrVal = F.getFnAttribute(Attr).getValueAsString();
+ return parseDenormalFPAttribute(AttrVal) != Value;
});
}
@@ -606,11 +626,12 @@ void ARMAsmPrinter::emitAttributes() {
if (!ArchFS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
else
- ArchFS = FS;
+ ArchFS = std::string(FS);
}
const ARMBaseTargetMachine &ATM =
static_cast<const ARMBaseTargetMachine &>(TM);
- const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
+ const ARMSubtarget STI(TT, std::string(CPU), ArchFS, ATM,
+ ATM.isLittleEndian());
// Emit build attributes for the available hardware.
ATS.emitTargetAttributes(STI);
@@ -641,16 +662,13 @@ void ARMAsmPrinter::emitAttributes() {
}
// Set FP Denormals.
- if (checkFunctionsAttributeConsistency(*MMI->getModule(),
- "denormal-fp-math",
- "preserve-sign") ||
- TM.Options.FPDenormalMode == FPDenormal::PreserveSign)
+ if (checkDenormalAttributeConsistency(*MMI->getModule(), "denormal-fp-math",
+ DenormalMode::getPreserveSign()))
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::PreserveFPSign);
- else if (checkFunctionsAttributeConsistency(*MMI->getModule(),
- "denormal-fp-math",
- "positive-zero") ||
- TM.Options.FPDenormalMode == FPDenormal::PositiveZero)
+ else if (checkDenormalAttributeConsistency(*MMI->getModule(),
+ "denormal-fp-math",
+ DenormalMode::getPositiveZero()))
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::PositiveZero);
else if (!TM.Options.UnsafeFPMath)
@@ -855,8 +873,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
llvm_unreachable("unexpected target");
}
-void ARMAsmPrinter::
-EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+void ARMAsmPrinter::emitMachineConstantPoolValue(
+ MachineConstantPoolValue *MCPV) {
const DataLayout &DL = getDataLayout();
int Size = DL.getTypeAllocSize(MCPV->getType());
@@ -876,11 +894,11 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
for (const auto *GV : ACPC->promotedGlobals()) {
if (!EmittedPromotedGlobalLabels.count(GV)) {
MCSymbol *GVSym = getSymbol(GV);
- OutStreamer->EmitLabel(GVSym);
+ OutStreamer->emitLabel(GVSym);
EmittedPromotedGlobalLabels.insert(GV);
}
}
- return EmitGlobalConstant(DL, ACPC->getPromotedGlobalInit());
+ return emitGlobalConstant(DL, ACPC->getPromotedGlobalInit());
}
MCSymbol *MCSym;
@@ -925,29 +943,29 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
// We want "(<expr> - .)", but MC doesn't have a concept of the '.'
// label, so just emit a local label end reference that instead.
MCSymbol *DotSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(DotSym);
+ OutStreamer->emitLabel(DotSym);
const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext);
}
Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext);
}
- OutStreamer->EmitValue(Expr, Size);
+ OutStreamer->emitValue(Expr, Size);
}
-void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
+void ARMAsmPrinter::emitJumpTableAddrs(const MachineInstr *MI) {
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
// Mark the jump table as data-in-code.
- OutStreamer->EmitDataRegion(MCDR_DataRegionJT32);
+ OutStreamer->emitDataRegion(MCDR_DataRegionJT32);
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
@@ -974,23 +992,23 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
else if (AFI->isThumbFunction())
Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(1,OutContext),
OutContext);
- OutStreamer->EmitValue(Expr, 4);
+ OutStreamer->emitValue(Expr, 4);
}
// Mark the end of jump table data-in-code region.
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
}
-void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
+void ARMAsmPrinter::emitJumpTableInsts(const MachineInstr *MI) {
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
@@ -1008,17 +1026,17 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
}
}
-void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
+void ARMAsmPrinter::emitJumpTableTBInst(const MachineInstr *MI,
unsigned OffsetWidth) {
assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width");
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
if (Subtarget->isThumb1Only())
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
@@ -1026,7 +1044,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
// Mark the jump table as data-in-code.
- OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
+ OutStreamer->emitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
: MCDR_DataRegionJT16);
for (auto MBB : JTBBs) {
@@ -1050,15 +1068,15 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
Expr = MCBinaryExpr::createSub(MBBSymbolExpr, Expr, OutContext);
Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext),
OutContext);
- OutStreamer->EmitValue(Expr, OffsetWidth);
+ OutStreamer->emitValue(Expr, OffsetWidth);
}
// Mark the end of jump table data-in-code region. 32-bit offsets use
// actual branch instructions here, so we don't mark those as a data-region
// at all.
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
// Make sure the next instruction is 2-byte aligned.
- EmitAlignment(Align(2));
+ emitAlignment(Align(2));
}
void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
@@ -1076,16 +1094,26 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
unsigned SrcReg, DstReg;
- if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
- // Two special cases:
- // 1) tPUSH does not have src/dst regs.
- // 2) for Thumb1 code we sometimes materialize the constant via constpool
- // load. Yes, this is pretty fragile, but for now I don't see better
- // way... :(
+ switch (Opc) {
+ case ARM::tPUSH:
+ // special case: tPUSH does not have src/dst regs.
SrcReg = DstReg = ARM::SP;
- } else {
+ break;
+ case ARM::tLDRpci:
+ case ARM::t2MOVi16:
+ case ARM::t2MOVTi16:
+ // special cases:
+ // 1) for Thumb1 code we sometimes materialize the constant via constpool
+ // load.
+ // 2) for Thumb2 execute only code we materialize the constant via
+ // immediate constants in 2 separate instructions (MOVW/MOVT).
+ SrcReg = ~0U;
+ DstReg = MI->getOperand(0).getReg();
+ break;
+ default:
SrcReg = MI->getOperand(1).getReg();
DstReg = MI->getOperand(0).getReg();
+ break;
}
// Try to figure out the unwinding opcode out of src / dst regs.
@@ -1189,23 +1217,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
case ARM::tADDrSPi:
Offset = -MI->getOperand(2).getImm()*4;
break;
- case ARM::tLDRpci: {
- // Grab the constpool index and check, whether it corresponds to
- // original or cloned constpool entry.
- unsigned CPI = MI->getOperand(1).getIndex();
- const MachineConstantPool *MCP = MF.getConstantPool();
- if (CPI >= MCP->getConstants().size())
- CPI = AFI->getOriginalCPIdx(CPI);
- assert(CPI != -1U && "Invalid constpool index");
-
- // Derive the actual offset.
- const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
- assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
- // FIXME: Check for user, it should be "add" instruction!
- Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+ case ARM::tADDhirr:
+ Offset =
+ -AFI->EHPrologueOffsetInRegs.lookup(MI->getOperand(2).getReg());
break;
}
- }
if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
if (DstReg == FramePtr && FramePtr != ARM::SP)
@@ -1225,14 +1241,43 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
} else if (DstReg == ARM::SP) {
MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
- } else if (Opc == ARM::tMOVr) {
- // If a Thumb1 function spills r8-r11, we copy the values to low
- // registers before pushing them. Record the copy so we can emit the
- // correct ".save" later.
- AFI->EHPrologueRemappedRegs[DstReg] = SrcReg;
} else {
- MI->print(errs());
- llvm_unreachable("Unsupported opcode for unwinding information");
+ int64_t Offset = 0;
+ switch (Opc) {
+ case ARM::tMOVr:
+ // If a Thumb1 function spills r8-r11, we copy the values to low
+ // registers before pushing them. Record the copy so we can emit the
+ // correct ".save" later.
+ AFI->EHPrologueRemappedRegs[DstReg] = SrcReg;
+ break;
+ case ARM::tLDRpci: {
+ // Grab the constpool index and check, whether it corresponds to
+ // original or cloned constpool entry.
+ unsigned CPI = MI->getOperand(1).getIndex();
+ const MachineConstantPool *MCP = MF.getConstantPool();
+ if (CPI >= MCP->getConstants().size())
+ CPI = AFI->getOriginalCPIdx(CPI);
+ assert(CPI != -1U && "Invalid constpool index");
+
+ // Derive the actual offset.
+ const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+ assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+ Offset = cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+ AFI->EHPrologueOffsetInRegs[DstReg] = Offset;
+ break;
+ }
+ case ARM::t2MOVi16:
+ Offset = MI->getOperand(1).getImm();
+ AFI->EHPrologueOffsetInRegs[DstReg] = Offset;
+ break;
+ case ARM::t2MOVTi16:
+ Offset = MI->getOperand(2).getImm();
+ AFI->EHPrologueOffsetInRegs[DstReg] |= (Offset << 16);
+ break;
+ default:
+ MI->print(errs());
+ llvm_unreachable("Unsupported opcode for unwinding information");
+ }
}
}
}
@@ -1241,7 +1286,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
// instructions) auto-generated.
#include "ARMGenMCPseudoLowering.inc"
-void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
const DataLayout &DL = getDataLayout();
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
@@ -1252,7 +1297,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
InConstantPool = false;
}
@@ -1513,7 +1558,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This is a pseudo op for a label used by a branch future instruction
// Emit the label.
- OutStreamer->EmitLabel(getBFLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getBFLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(0).getIndex(), OutContext));
return;
@@ -1525,7 +1570,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(2).getImm(), OutContext));
@@ -1546,7 +1591,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(2).getImm(), OutContext));
@@ -1577,7 +1622,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// a PC-relative address at the ldr instruction.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(2).getImm(), OutContext));
@@ -1620,28 +1665,28 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// If this is the first entry of the pool, mark it.
if (!InConstantPool) {
- OutStreamer->EmitDataRegion(MCDR_DataRegion);
+ OutStreamer->emitDataRegion(MCDR_DataRegion);
InConstantPool = true;
}
- OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+ OutStreamer->emitLabel(GetCPISymbol(LabelId));
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
if (MCPE.isMachineConstantPoolEntry())
- EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ emitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
else
- EmitGlobalConstant(DL, MCPE.Val.ConstVal);
+ emitGlobalConstant(DL, MCPE.Val.ConstVal);
return;
}
case ARM::JUMPTABLE_ADDRS:
- EmitJumpTableAddrs(MI);
+ emitJumpTableAddrs(MI);
return;
case ARM::JUMPTABLE_INSTS:
- EmitJumpTableInsts(MI);
+ emitJumpTableInsts(MI);
return;
case ARM::JUMPTABLE_TBB:
case ARM::JUMPTABLE_TBH:
- EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
+ emitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
return;
case ARM::t2BR_JT: {
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
@@ -1656,7 +1701,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::t2TBH_JT: {
unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? ARM::t2TBB : ARM::t2TBH;
// Lower and emit the PC label, then the instruction itself.
- OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+ OutStreamer->emitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
@@ -1698,7 +1743,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// FIXME: Ideally we could vary the LDRB index based on the padding
// between the sequence and jump table, however that relies on MCExprs
// for load indexes which are currently not supported.
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
.addReg(Idx)
.addReg(Idx)
@@ -1740,7 +1785,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+ OutStreamer->emitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
.addReg(ARM::PC)
.addReg(ARM::PC)
@@ -1809,7 +1854,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case ARM::SPACE:
- OutStreamer->EmitZeros(MI->getOperand(1).getImm());
+ OutStreamer->emitZeros(MI->getOperand(1).getImm());
return;
case ARM::TRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
@@ -1904,7 +1949,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer->EmitLabel(Label);
+ OutStreamer->emitLabel(Label);
return;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h
index a4b37fa2331f..f8ff047a1d06 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -84,21 +84,21 @@ public:
void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
const MCSubtargetInfo *EndInfo) const override;
- void EmitJumpTableAddrs(const MachineInstr *MI);
- void EmitJumpTableInsts(const MachineInstr *MI);
- void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitJumpTableAddrs(const MachineInstr *MI);
+ void emitJumpTableInsts(const MachineInstr *MI);
+ void emitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
+ void emitInstruction(const MachineInstr *MI) override;
bool runOnMachineFunction(MachineFunction &F) override;
- void EmitConstantPool() override {
+ void emitConstantPool() override {
// we emit constant pools customly!
}
- void EmitFunctionBodyEnd() override;
- void EmitFunctionEntryLabel() override;
- void EmitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
- void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
- void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void emitFunctionBodyEnd() override;
+ void emitFunctionEntryLabel() override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
+ void emitXXStructor(const DataLayout &DL, const Constant *CV) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
MCSymbol *GetCPISymbol(unsigned CPID) const override;
@@ -117,7 +117,7 @@ public:
private:
void EmitSled(const MachineInstr &MI, SledKind Kind);
- // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+ // Helpers for emitStartOfAsmFile() and emitEndOfAsmFile()
void emitAttributes();
// Generic helper used to emit e.g. ARMv5 mul pseudos
@@ -150,7 +150,7 @@ private:
public:
/// EmitMachineConstantPoolValue - Print a machine constantpool value to
/// the .s file.
- void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 48f781510254..4cc2b6bf7e7e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
@@ -495,6 +496,31 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL;
}
+std::string ARMBaseInstrInfo::createMIROperandComment(
+ const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx,
+ const TargetRegisterInfo *TRI) const {
+
+ // First, let's see if there is a generic comment for this operand
+ std::string GenericComment =
+ TargetInstrInfo::createMIROperandComment(MI, Op, OpIdx, TRI);
+ if (!GenericComment.empty())
+ return GenericComment;
+
+ // If not, check if we have an immediate operand.
+ if (Op.getType() != MachineOperand::MO_Immediate)
+ return std::string();
+
+ // And print its corresponding condition code if the immediate is a
+ // predicate.
+ int FirstPredOp = MI.findFirstPredOperandIdx();
+ if (FirstPredOp != (int) OpIdx)
+ return std::string();
+
+ std::string CC = "CC::";
+ CC += ARMCondCodeToString((ARMCC::CondCodes)Op.getImm());
+ return CC;
+}
+
bool ARMBaseInstrInfo::PredicateInstruction(
MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
unsigned Opc = MI.getOpcode();
@@ -811,7 +837,7 @@ void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) {
}
void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB,
- unsigned DestReg) {
+ Register DestReg) {
addUnpredicatedMveVpredNOp(MIB);
MIB.addReg(DestReg, RegState::Undef);
}
@@ -1009,6 +1035,36 @@ ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
}
+Optional<ParamLoadedValue>
+ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const {
+ if (auto DstSrcPair = isCopyInstrImpl(MI)) {
+ Register DstReg = DstSrcPair->Destination->getReg();
+
+ // TODO: We don't handle cases where the forwarding reg is narrower/wider
+ // than the copy registers. Consider for example:
+ //
+ // s16 = VMOVS s0
+ // s17 = VMOVS s1
+ // call @callee(d0)
+ //
+ // We'd like to describe the call site value of d0 as d8, but this requires
+ // gathering and merging the descriptions for the two VMOVS instructions.
+ //
+ // We also don't handle the reverse situation, where the forwarding reg is
+ // narrower than the copy destination:
+ //
+ // d8 = VMOVD d0
+ // call @callee(s1)
+ //
+ // We need to produce a fragment description (the call site value of s1 is
+ // /not/ just d8).
+ if (DstReg != Reg)
+ return None;
+ }
+ return TargetInstrInfo::describeLoadedValue(MI, Reg);
+}
+
const MachineInstrBuilder &
ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
@@ -1023,16 +1079,16 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
void ARMBaseInstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
+ Align Alignment = MFI.getObjectAlign(FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), Align);
+ MFI.getObjectSize(FI), Alignment);
switch (TRI->getSpillSize(*RC)) {
case 2:
@@ -1102,7 +1158,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 16:
if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
// Use aligned spills if the stack can be realigned.
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF)) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64))
.addFrameIndex(FI)
.addImm(16)
@@ -1130,7 +1186,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo))
.addFrameIndex(FI)
@@ -1153,7 +1209,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
@@ -1264,17 +1320,17 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
void ARMBaseInstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
+ const Align Alignment = MFI.getObjectAlign(FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), Align);
+ MFI.getObjectSize(FI), Alignment);
switch (TRI->getSpillSize(*RC)) {
case 2:
@@ -1343,7 +1399,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 16:
if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF)) {
BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
.addFrameIndex(FI)
.addImm(16)
@@ -1367,7 +1423,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
.addFrameIndex(FI)
@@ -1390,7 +1446,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
.addFrameIndex(FI)
@@ -1682,13 +1738,13 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
else
llvm_unreachable("Unexpected ARM constantpool value type!!");
- CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
+ CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlign());
return PCLabelId;
}
void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
unsigned Opcode = Orig.getOpcode();
@@ -1959,6 +2015,10 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
if (MI.isTerminator() || MI.isPosition())
return true;
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
+
// Treat the start of the IT block as a scheduling boundary, but schedule
// t2IT along with all instructions following it.
// FIXME: This is a big hammer. But the alternative is to add all potential
@@ -2120,7 +2180,7 @@ ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg) {
+ Register &PredReg) {
int PIdx = MI.findFirstPredOperandIdx();
if (PIdx == -1) {
PredReg = 0;
@@ -2150,7 +2210,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
case ARM::MOVCCr:
case ARM::t2MOVCCr: {
// MOVCC can be commuted by inverting the condition.
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
// MOVCC AL can't be inverted. Shouldn't happen.
if (CC == ARMCC::AL || PredReg != ARM::CPSR)
@@ -2171,9 +2231,9 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
/// Identify instructions that can be folded into a MOVCC instruction, and
/// return the defining instruction.
MachineInstr *
-ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
+ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const {
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
return nullptr;
@@ -2353,9 +2413,9 @@ unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII,
unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
@@ -2515,7 +2575,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
}
bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MI.getDesc();
@@ -2671,8 +2731,8 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
-bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
@@ -2708,7 +2768,7 @@ bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
/// operates on the given source register and applies the same mask
/// as a 'tst' instruction. Provide a limited look-through for copies.
/// When successful, MI will hold the found instruction.
-static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+static bool isSuitableForMask(MachineInstr *&MI, Register SrcReg,
int CmpMask, bool CommonUse) {
switch (MI->getOpcode()) {
case ARM::ANDri:
@@ -2743,7 +2803,7 @@ inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
/// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X).
/// This function can be extended later on.
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
- unsigned SrcReg, unsigned SrcReg2,
+ Register SrcReg, Register SrcReg2,
int ImmValue, const MachineInstr *OI,
bool &IsThumb1) {
if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
@@ -2879,7 +2939,7 @@ static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) {
/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
/// condition code of instructions which use the flags.
bool ARMBaseInstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
@@ -3166,7 +3226,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
return true;
MachineBasicBlock::const_iterator Next = &MI;
++Next;
- unsigned SrcReg, SrcReg2;
+ Register SrcReg, SrcReg2;
int CmpMask, CmpValue;
bool IsThumb1;
if (Next != MI.getParent()->end() &&
@@ -3177,7 +3237,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
}
bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg,
+ Register Reg,
MachineRegisterInfo *MRI) const {
// Fold large immediates into add, sub, or, xor.
unsigned DefOpc = DefMI.getOpcode();
@@ -3729,7 +3789,7 @@ unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
if ((NumRegs % 2) || !MI.hasOneMemOperand() ||
- (*MI.memoperands_begin())->getAlignment() < 8)
+ (*MI.memoperands_begin())->getAlign() < Align(8))
++UOps;
return UOps;
}
@@ -4316,10 +4376,10 @@ int ARMBaseInstrInfo::getOperandLatencyImpl(
return -1;
unsigned DefAlign = DefMI.hasOneMemOperand()
- ? (*DefMI.memoperands_begin())->getAlignment()
+ ? (*DefMI.memoperands_begin())->getAlign().value()
: 0;
unsigned UseAlign = UseMI.hasOneMemOperand()
- ? (*UseMI.memoperands_begin())->getAlignment()
+ ? (*UseMI.memoperands_begin())->getAlign().value()
: 0;
// Get the itinerary's latency if possible, and handle variable_ops.
@@ -4366,10 +4426,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
auto *DefMN = cast<MachineSDNode>(DefNode);
unsigned DefAlign = !DefMN->memoperands_empty()
- ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+ ? (*DefMN->memoperands_begin())->getAlign().value()
+ : 0;
auto *UseMN = cast<MachineSDNode>(UseNode);
unsigned UseAlign = !UseMN->memoperands_empty()
- ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+ ? (*UseMN->memoperands_begin())->getAlign().value()
+ : 0;
int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
UseMCID, UseIdx, UseAlign);
@@ -4660,7 +4722,7 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// Adjust for dynamic def-side opcode variants not captured by the itinerary.
unsigned DefAlign =
- MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0;
+ MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlign().value() : 0;
int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign);
if (Adj >= 0 || (int)Latency > -Adj) {
return Latency + Adj;
@@ -4782,7 +4844,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4));
MIB.addMemOperand(MMO).add(predOps(ARMCC::AL));
}
@@ -5353,7 +5415,8 @@ Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI,
// TODO: Handle cases where Reg is a super- or sub-register of the
// destination register.
- if (Reg != MI.getOperand(0).getReg())
+ const MachineOperand &Op0 = MI.getOperand(0);
+ if (!Op0.isReg() || Reg != Op0.getReg())
return None;
// We describe SUBri or ADDri instructions.
@@ -5365,8 +5428,7 @@ Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI,
// TODO: Third operand can be global address (usually some string). Since
// strings can be relocated we cannot calculate their offsets for
// now.
- if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
- !MI.getOperand(2).isImm())
+ if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm())
return None;
Offset = MI.getOperand(2).getImm() * Sign;
@@ -5402,7 +5464,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri)
return nullptr;
Register Reg = CmpMI->getOperand(0).getReg();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg);
if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0)
return nullptr;
@@ -5460,3 +5522,521 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) <
ConstantMaterializationCost(Val2, Subtarget, !ForCodesize);
}
+
+/// Constants defining how certain sequences should be outlined.
+/// This encompasses how an outlined function should be called, and what kind of
+/// frame should be emitted for that outlined function.
+///
+/// \p MachineOutlinerTailCall implies that the function is being created from
+/// a sequence of instructions ending in a return.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> B OUTLINED_FUNCTION I1
+/// BX LR I2
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 4 | 4 |
+/// | Frame overhead in Bytes | 0 | 0 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerThunk implies that the function is being created from
+/// a sequence of instructions ending in a call. The outlined function is
+/// called with a BL instruction, and the outlined function tail-calls the
+/// original call destination.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// BL f I2
+/// B f
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 4 | 4 |
+/// | Frame overhead in Bytes | 0 | 0 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerNoLRSave implies that the function should be called using
+/// a BL instruction, but doesn't require LR to be saved and restored. This
+/// happens when LR is known to be dead.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// I3 I2
+/// I3
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 4 | 4 |
+/// | Frame overhead in Bytes | 4 | 4 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerRegSave implies that the function should be called with a
+/// save and restore of LR to an available register. This allows us to avoid
+/// stack fixups. Note that this outlining variant is compatible with the
+/// NoLRSave case.
+///
+/// That is,
+///
+/// I1 Save LR OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// I3 Restore LR I2
+/// I3
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 8 | 12 |
+/// | Frame overhead in Bytes | 2 | 4 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+
+enum MachineOutlinerClass {
+ MachineOutlinerTailCall,
+ MachineOutlinerThunk,
+ MachineOutlinerNoLRSave,
+ MachineOutlinerRegSave
+};
+
+enum MachineOutlinerMBBFlags {
+ LRUnavailableSomewhere = 0x2,
+ HasCalls = 0x4,
+ UnsafeRegsDead = 0x8
+};
+
+struct OutlinerCosts {
+ const int CallTailCall;
+ const int FrameTailCall;
+ const int CallThunk;
+ const int FrameThunk;
+ const int CallNoLRSave;
+ const int FrameNoLRSave;
+ const int CallRegSave;
+ const int FrameRegSave;
+
+ OutlinerCosts(const ARMSubtarget &target)
+ : CallTailCall(target.isThumb() ? 4 : 4),
+ FrameTailCall(target.isThumb() ? 0 : 0),
+ CallThunk(target.isThumb() ? 4 : 4),
+ FrameThunk(target.isThumb() ? 0 : 0),
+ CallNoLRSave(target.isThumb() ? 4 : 4),
+ FrameNoLRSave(target.isThumb() ? 4 : 4),
+ CallRegSave(target.isThumb() ? 8 : 12),
+ FrameRegSave(target.isThumb() ? 2 : 4) {}
+};
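The byte costs above are what the generic MachineOutliner weighs against the size of the repeated sequence when deciding whether outlining a candidate set is profitable. A minimal standalone sketch of that trade-off, using hypothetical numbers and helper names (OutlineEstimate, bytesSaved) that are not part of this patch:

// Sketch only: the real bookkeeping lives in the target-independent
// MachineOutliner, which consumes the per-candidate call cost and frame cost
// chosen in getOutliningCandidateInfo below.
struct OutlineEstimate {
  unsigned Occurrences;   // how many times the sequence repeats
  unsigned SequenceBytes; // size of one copy of the sequence
  unsigned CallBytes;     // e.g. CallNoLRSave == 4 on Thumb2
  unsigned FrameBytes;    // e.g. FrameNoLRSave == 4 on Thumb2
};

constexpr int bytesSaved(const OutlineEstimate &E) {
  unsigned NotOutlined = E.Occurrences * E.SequenceBytes;
  unsigned Outlined =
      E.Occurrences * E.CallBytes + E.SequenceBytes + E.FrameBytes;
  return static_cast<int>(NotOutlined) - static_cast<int>(Outlined);
}

// A 12-byte sequence repeated three times: 36 bytes inline versus
// 3 calls (12) + one outlined body (12) + its frame (4) = 28, saving 8 bytes.
static_assert(bytesSaved({3, 12, 4, 4}) == 8, "outlining pays off");
static_assert(bytesSaved({2, 6, 4, 4}) < 0, "too small to be worth a call");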
+
+unsigned
+ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
+ assert(C.LRUWasSet && "LRU wasn't set?");
+ MachineFunction *MF = C.getMF();
+ const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+
+ BitVector regsReserved = ARI->getReservedRegs(*MF);
+ // Check if there is an available register across the sequence that we can
+ // use.
+ for (unsigned Reg : ARM::rGPRRegClass) {
+ if (!(Reg < regsReserved.size() && regsReserved.test(Reg)) &&
+ Reg != ARM::LR && // LR is not reserved, but don't use it.
+ Reg != ARM::R12 && // R12 is not guaranteed to be preserved.
+ C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
+ return Reg;
+ }
+
+ // No suitable register. Return 0.
+ return 0u;
+}
+
+outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
+ unsigned SequenceSize =
+ std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
+ [this](unsigned Sum, const MachineInstr &MI) {
+ return Sum + getInstSizeInBytes(MI);
+ });
+
+ // Properties about candidate MBBs that hold for all of them.
+ unsigned FlagsSetInAll = 0xF;
+
+ // Compute liveness information for each candidate, and set FlagsSetInAll.
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ std::for_each(
+ RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
+ [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; });
+
+ // According to the ARM Procedure Call Standard, the following are
+ // undefined on entry/exit from a function call:
+ //
+ // * Register R12(IP),
+ // * Condition codes (and thus the CPSR register)
+ //
+ // Since we control the instructions which are part of the outlined regions
+ // we don't need to be fully compliant with the AAPCS, but we have to
+ // guarantee that if a veneer is inserted at link time the code is still
+ // correct. Because of this, we can't outline any sequence of instructions
+ // where one of these registers is live into/across it. Thus, we need to
+ // delete those candidates.
+ auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
+ // If the unsafe registers in this block are all dead, then we don't need
+ // to compute liveness here.
+ if (C.Flags & UnsafeRegsDead)
+ return false;
+ C.initLRU(TRI);
+ LiveRegUnits LRU = C.LRU;
+ return (!LRU.available(ARM::R12) || !LRU.available(ARM::CPSR));
+ };
+
+ // Are there any candidates where those registers are live?
+ if (!(FlagsSetInAll & UnsafeRegsDead)) {
+ // Erase every candidate that violates the restrictions above. (It could be
+ // true that we have viable candidates, so it's not worth bailing out in
+    // the case that, say, 1 out of 20 candidates violates the restrictions.)
+ RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ CantGuaranteeValueAcrossCall),
+ RepeatedSequenceLocs.end());
+
+ // If the sequence doesn't have enough candidates left, then we're done.
+ if (RepeatedSequenceLocs.size() < 2)
+ return outliner::OutlinedFunction();
+ }
+
+ // At this point, we have only "safe" candidates to outline. Figure out
+ // frame + call instruction information.
+
+ unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
+
+ // Helper lambda which sets call information for every candidate.
+ auto SetCandidateCallInfo =
+ [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(CallID, NumBytesForCall);
+ };
+
+ OutlinerCosts Costs(Subtarget);
+ unsigned FrameID = 0;
+ unsigned NumBytesToCreateFrame = 0;
+
+ // If the last instruction in any candidate is a terminator, then we should
+ // tail call all of the candidates.
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
+ FrameID = MachineOutlinerTailCall;
+ NumBytesToCreateFrame = Costs.FrameTailCall;
+ SetCandidateCallInfo(MachineOutlinerTailCall, Costs.CallTailCall);
+ } else if (LastInstrOpcode == ARM::BL || LastInstrOpcode == ARM::BLX ||
+ LastInstrOpcode == ARM::tBL || LastInstrOpcode == ARM::tBLXr ||
+ LastInstrOpcode == ARM::tBLXi) {
+ FrameID = MachineOutlinerThunk;
+ NumBytesToCreateFrame = Costs.FrameThunk;
+ SetCandidateCallInfo(MachineOutlinerThunk, Costs.CallThunk);
+ } else {
+ // We need to decide how to emit calls + frames. We can always emit the same
+ // frame if we don't need to save to the stack.
+ unsigned NumBytesNoStackCalls = 0;
+ std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
+
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ C.initLRU(TRI);
+
+ // Is LR available? If so, we don't need a save.
+ if (C.LRU.available(ARM::LR)) {
+ FrameID = MachineOutlinerNoLRSave;
+ NumBytesNoStackCalls += Costs.CallNoLRSave;
+ C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave);
+ CandidatesWithoutStackFixups.push_back(C);
+ }
+
+ // Is an unused register available? If so, we won't modify the stack, so
+ // we can outline with the same frame type as those that don't save LR.
+ else if (findRegisterToSaveLRTo(C)) {
+ FrameID = MachineOutlinerRegSave;
+ NumBytesNoStackCalls += Costs.CallRegSave;
+ C.setCallInfo(MachineOutlinerRegSave, Costs.CallRegSave);
+ CandidatesWithoutStackFixups.push_back(C);
+ }
+ }
+
+ if (!CandidatesWithoutStackFixups.empty()) {
+ RepeatedSequenceLocs = CandidatesWithoutStackFixups;
+ } else
+ return outliner::OutlinedFunction();
+ }
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ NumBytesToCreateFrame, FrameID);
+}
+
+bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom(
+ MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
+ const Function &F = MF.getFunction();
+
+ // Can F be deduplicated by the linker? If it can, don't outline from it.
+ if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
+ return false;
+
+ // Don't outline from functions with section markings; the program could
+ // expect that all the code is in the named section.
+ // FIXME: Allow outlining from multiple functions with the same section
+ // marking.
+ if (F.hasSection())
+ return false;
+
+ // FIXME: Thumb1 outlining is not handled
+ if (MF.getInfo<ARMFunctionInfo>()->isThumb1OnlyFunction())
+ return false;
+
+ // It's safe to outline from MF.
+ return true;
+}
+
+bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+ unsigned &Flags) const {
+ // Check if LR is available through all of the MBB. If it's not, then set
+ // a flag.
+ assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
+ "Suitable Machine Function for outlining must track liveness");
+
+ LiveRegUnits LRU(getRegisterInfo());
+
+ std::for_each(MBB.rbegin(), MBB.rend(),
+ [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
+
+ // Check if each of the unsafe registers are available...
+ bool R12AvailableInBlock = LRU.available(ARM::R12);
+ bool CPSRAvailableInBlock = LRU.available(ARM::CPSR);
+
+ // If all of these are dead (and not live out), we know we don't have to check
+ // them later.
+ if (R12AvailableInBlock && CPSRAvailableInBlock)
+ Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
+
+ // Now, add the live outs to the set.
+ LRU.addLiveOuts(MBB);
+
+ // If any of these registers is available in the MBB, but also a live out of
+ // the block, then we know outlining is unsafe.
+ if (R12AvailableInBlock && !LRU.available(ARM::R12))
+ return false;
+ if (CPSRAvailableInBlock && !LRU.available(ARM::CPSR))
+ return false;
+
+ // Check if there's a call inside this MachineBasicBlock. If there is, then
+ // set a flag.
+ if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
+ Flags |= MachineOutlinerMBBFlags::HasCalls;
+
+ if (!LRU.available(ARM::LR))
+ Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+
+ return true;
+}
+
+outliner::InstrType
+ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // Be conservative with inline ASM
+ if (MI.isInlineAsm())
+ return outliner::InstrType::Illegal;
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL or IMPLICIT_DEF instructions don't really tell us much
+ // so we can go ahead and skip over them.
+ if (MI.isKill() || MI.isImplicitDef())
+ return outliner::InstrType::Invisible;
+
+ // PIC instructions contain labels, outlining them would break offset
+  // computing.
+ unsigned Opc = MI.getOpcode();
+ if (Opc == ARM::tPICADD || Opc == ARM::PICADD || Opc == ARM::PICSTR ||
+ Opc == ARM::PICSTRB || Opc == ARM::PICSTRH || Opc == ARM::PICLDR ||
+ Opc == ARM::PICLDRB || Opc == ARM::PICLDRH || Opc == ARM::PICLDRSB ||
+ Opc == ARM::PICLDRSH || Opc == ARM::t2LDRpci_pic ||
+ Opc == ARM::t2MOVi16_ga_pcrel || Opc == ARM::t2MOVTi16_ga_pcrel ||
+ Opc == ARM::t2MOV_ga_pcrel)
+ return outliner::InstrType::Illegal;
+
+  // Be conservative with the ARMv8.1-M low-overhead loop and MVE pseudo-instructions.
+ if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
+ Opc == ARM::t2WhileLoopStart || Opc == ARM::t2LoopDec ||
+ Opc == ARM::t2LoopEnd)
+ return outliner::InstrType::Illegal;
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t MIFlags = MCID.TSFlags;
+ if ((MIFlags & ARMII::DomainMask) == ARMII::DomainMVE)
+ return outliner::InstrType::Illegal;
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+ // Don't outline if the branch is not unconditional.
+ if (isPredicated(MI))
+ return outliner::InstrType::Illegal;
+
+ // Is this the end of a function?
+ if (MI.getParent()->succ_empty())
+ return outliner::InstrType::Legal;
+
+ // It's not, so don't outline it.
+ return outliner::InstrType::Illegal;
+ }
+
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return outliner::InstrType::Illegal;
+ }
+
+ // Don't outline if link register or program counter value are used.
+ if (MI.readsRegister(ARM::LR, TRI) || MI.readsRegister(ARM::PC, TRI))
+ return outliner::InstrType::Illegal;
+
+ if (MI.isCall()) {
+ // If we don't know anything about the callee, assume it depends on the
+ // stack layout of the caller. In that case, it's only legal to outline
+ // as a tail-call. Explicitly list the call instructions we know about so
+ // we don't get unexpected results with call pseudo-instructions.
+ auto UnknownCallOutlineType = outliner::InstrType::Illegal;
+ if (Opc == ARM::BL || Opc == ARM::tBL || Opc == ARM::BLX ||
+ Opc == ARM::tBLXr || Opc == ARM::tBLXi)
+ UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
+
+ return UnknownCallOutlineType;
+ }
+
+ // Since calls are handled, don't touch LR or PC
+ if (MI.modifiesRegister(ARM::LR, TRI) || MI.modifiesRegister(ARM::PC, TRI))
+ return outliner::InstrType::Illegal;
+
+ // Does this use the stack?
+ if (MI.modifiesRegister(ARM::SP, TRI) || MI.readsRegister(ARM::SP, TRI)) {
+ // True if there is no chance that any outlined candidate from this range
+ // could require stack fixups. That is, both
+ // * LR is available in the range (No save/restore around call)
+ // * The range doesn't include calls (No save/restore in outlined frame)
+ // are true.
+ // FIXME: This is very restrictive; the flags check the whole block,
+ // not just the bit we will try to outline.
+ bool MightNeedStackFixUp =
+ (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere |
+ MachineOutlinerMBBFlags::HasCalls));
+
+ if (!MightNeedStackFixUp)
+ return outliner::InstrType::Legal;
+
+ return outliner::InstrType::Illegal;
+ }
+
+ // Be conservative with IT blocks.
+ if (MI.readsRegister(ARM::ITSTATE, TRI) ||
+ MI.modifiesRegister(ARM::ITSTATE, TRI))
+ return outliner::InstrType::Illegal;
+
+  // Don't outline position-marking pseudo-instructions (labels, CFI instructions).
+ if (MI.isPosition())
+ return outliner::InstrType::Illegal;
+
+ return outliner::InstrType::Legal;
+}
+
+void ARMBaseInstrInfo::buildOutlinedFrame(
+ MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const {
+ // Nothing is needed for tail-calls.
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
+ return;
+
+ // For thunk outlining, rewrite the last instruction from a call to a
+ // tail-call.
+ if (OF.FrameConstructionID == MachineOutlinerThunk) {
+ MachineInstr *Call = &*--MBB.instr_end();
+ bool isThumb = Subtarget.isThumb();
+ unsigned FuncOp = isThumb ? 2 : 0;
+ unsigned Opc = Call->getOperand(FuncOp).isReg()
+ ? isThumb ? ARM::tTAILJMPr : ARM::TAILJMPr
+ : isThumb ? Subtarget.isTargetMachO() ? ARM::tTAILJMPd
+ : ARM::tTAILJMPdND
+ : ARM::TAILJMPd;
+ MachineInstrBuilder MIB = BuildMI(MBB, MBB.end(), DebugLoc(), get(Opc))
+ .add(Call->getOperand(FuncOp));
+ if (isThumb && !Call->getOperand(FuncOp).isReg())
+ MIB.add(predOps(ARMCC::AL));
+ Call->eraseFromParent();
+ return;
+ }
+
+ // Here we have to insert the return ourselves. Get the correct opcode from
+ // current feature set.
+ BuildMI(MBB, MBB.end(), DebugLoc(), get(Subtarget.getReturnOpcode()))
+ .add(predOps(ARMCC::AL));
+}
+
+MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall(
+ Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+ MachineFunction &MF, const outliner::Candidate &C) const {
+ MachineInstrBuilder MIB;
+ MachineBasicBlock::iterator CallPt;
+ unsigned Opc;
+ bool isThumb = Subtarget.isThumb();
+
+ // Are we tail calling?
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
+ // If yes, then we can just branch to the label.
+ Opc = isThumb
+ ? Subtarget.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND
+ : ARM::TAILJMPd;
+ MIB = BuildMI(MF, DebugLoc(), get(Opc))
+ .addGlobalAddress(M.getNamedValue(MF.getName()));
+ if (isThumb)
+ MIB.add(predOps(ARMCC::AL));
+ It = MBB.insert(It, MIB);
+ return It;
+ }
+
+ // Create the call instruction.
+ Opc = isThumb ? ARM::tBL : ARM::BL;
+ MachineInstrBuilder CallMIB = BuildMI(MF, DebugLoc(), get(Opc));
+ if (isThumb)
+ CallMIB.add(predOps(ARMCC::AL));
+ CallMIB.addGlobalAddress(M.getNamedValue(MF.getName()));
+
+ // Can we save to a register?
+ if (C.CallConstructionID == MachineOutlinerRegSave) {
+ unsigned Reg = findRegisterToSaveLRTo(C);
+ assert(Reg != 0 && "No callee-saved register available?");
+
+ // Save and restore LR from that register.
+ if (!MBB.isLiveIn(ARM::LR))
+ MBB.addLiveIn(ARM::LR);
+ copyPhysReg(MBB, It, DebugLoc(), Reg, ARM::LR, true);
+ CallPt = MBB.insert(It, CallMIB);
+ copyPhysReg(MBB, It, DebugLoc(), ARM::LR, Reg, true);
+ It--;
+ return CallPt;
+ }
+ // Insert the call.
+ It = MBB.insert(It, CallMIB);
+ return It;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index f6d4ebe3a090..1a75b011ca59 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -21,6 +21,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include <array>
#include <cstdint>
@@ -105,6 +107,11 @@ protected:
Optional<DestSourcePair>
isCopyInstrImpl(const MachineInstr &MI) const override;
+ /// Specialization of \ref TargetInstrInfo::describeLoadedValue, used to
+ /// enhance debug entry value descriptions for ARM targets.
+ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const override;
+
public:
// Return whether the target has an explicit NOP encoding.
bool hasNOP() const;
@@ -146,6 +153,12 @@ public:
// Predication support.
bool isPredicated(const MachineInstr &MI) const override;
+ // MIR printer helper function to annotate Operands with a comment.
+ std::string
+ createMIROperandComment(const MachineInstr &MI, const MachineOperand &Op,
+ unsigned OpIdx,
+ const TargetRegisterInfo *TRI) const override;
+
ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
int PIdx = MI.findFirstPredOperandIdx();
return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
@@ -207,13 +220,13 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -222,7 +235,7 @@ public:
bool shouldSink(const MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
@@ -286,16 +299,16 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction to set the zero flag so
/// that we can remove a "comparison with zero"; Remove a redundant CMP
/// instruction if the flags can be updated in the same way by an earlier
/// instruction such as SUB.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
bool analyzeSelect(const MachineInstr &MI,
@@ -308,7 +321,7 @@ public:
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
- bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;
unsigned getNumMicroOps(const InstrItineraryData *ItinData,
@@ -343,7 +356,27 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+ /// ARM supports the MachineOutliner.
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
+ bool OutlineFromLinkOnceODRs) const override;
+ outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+ outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const override;
+ bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+ unsigned &Flags) const override;
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It, MachineFunction &MF,
+ const outliner::Candidate &C) const override;
+
private:
+ /// Returns an unused general-purpose register which can be used for
+ /// constructing an outlined call if one exists. Returns 0 otherwise.
+ unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
+
unsigned getInstBundleLength(const MachineInstr &MI) const;
int getVLDMDefCycle(const InstrItineraryData *ItinData,
@@ -403,7 +436,7 @@ private:
/// Identify instructions that can be folded into a MOVCC instruction, and
/// return the defining instruction.
- MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
+ MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const;
private:
@@ -491,24 +524,6 @@ bool isUncondBranchOpcode(int Opc) {
// This table shows the VPT instruction variants, i.e. the different
// mask field encodings, see also B5.6. Predication/conditional execution in
// the ArmARM.
-enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
-};
-
static inline bool isVPTOpcode(int Opc) {
return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 ||
Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 ||
@@ -595,6 +610,18 @@ unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) {
return 0;
}
+static inline unsigned getTailPredVectorWidth(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unhandled vctp opcode");
+ case ARM::MVE_VCTP8: return 16;
+ case ARM::MVE_VCTP16: return 8;
+ case ARM::MVE_VCTP32: return 4;
+ case ARM::MVE_VCTP64: return 2;
+ }
+ return 0;
+}
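The hard-coded widths are simply the lane counts of a 128-bit MVE vector for each VCTP element size; a throwaway check of that relationship (mveLanes is illustrative, not part of the patch):

// MVE vectors are always 128 bits wide, so a VCTPn instruction predicates
// 128 / n lanes; the switch above hard-codes these four cases.
constexpr unsigned mveLanes(unsigned ElementBits) { return 128 / ElementBits; }
static_assert(mveLanes(8) == 16, "MVE_VCTP8");
static_assert(mveLanes(16) == 8, "MVE_VCTP16");
static_assert(mveLanes(32) == 4, "MVE_VCTP32");
static_assert(mveLanes(64) == 2, "MVE_VCTP64");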
+
static inline
bool isVCTP(MachineInstr *MI) {
switch (MI->getOpcode()) {
@@ -642,20 +669,31 @@ static inline bool isPushOpcode(int Opc) {
Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
}
+static inline bool isSubImmOpcode(int Opc) {
+ return Opc == ARM::SUBri ||
+ Opc == ARM::tSUBi3 || Opc == ARM::tSUBi8 ||
+ Opc == ARM::tSUBSi3 || Opc == ARM::tSUBSi8 ||
+ Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri;
+}
+
+static inline bool isMovRegOpcode(int Opc) {
+ return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr;
+}
/// isValidCoprocessorNumber - decide whether an explicit coprocessor
/// number is legal in generic instructions like CDP. The answer can
/// vary with the subtarget.
static inline bool isValidCoprocessorNumber(unsigned Num,
const FeatureBitset& featureBits) {
+  // In Armv7 and Armv8-M, CP10 and CP11 clash with VFP/NEON; however, the
+  // coprocessor numbers remain valid for CDP/MCR/MRC and friends. Allowing
+  // them is useful for code shared with older architectures that do not know
+  // the new VFP/NEON mnemonics.
+
// Armv8-A disallows everything *other* than 111x (CP14 and CP15).
if (featureBits[ARM::HasV8Ops] && (Num & 0xE) != 0xE)
return false;
- // Armv7 disallows 101x (CP10 and CP11), which clash with VFP/NEON.
- if (featureBits[ARM::HasV7Ops] && (Num & 0xE) == 0xA)
- return false;
-
- // Armv8.1-M also disallows 100x (CP8,CP9) and 111x (CP14,CP15)
+ // Armv8.1-M disallows 100x (CP8,CP9) and 111x (CP14,CP15)
// which clash with MVE.
if (featureBits[ARM::HasV8_1MMainlineOps] &&
((Num & 0xE) == 0x8 || (Num & 0xE) == 0xE))
@@ -667,7 +705,7 @@ static inline bool isValidCoprocessorNumber(unsigned Num,
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
-ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, Register &PredReg);
unsigned getMatchingCondBranchOpcode(unsigned Opc);
@@ -681,21 +719,21 @@ unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
/// code.
void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI,
unsigned MIFlags = 0);
@@ -714,11 +752,11 @@ bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
/// offset could not be handled directly in MI, and return the left-over
/// portion by reference.
bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII);
bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII,
const TargetRegisterInfo *TRI);
@@ -733,7 +771,7 @@ MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br,
const TargetRegisterInfo *TRI);
void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB);
-void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg);
+void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, Register DestReg);
void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
@@ -753,6 +791,70 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
const ARMSubtarget *Subtarget,
bool ForCodesize = false);
+// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
+// Returns 0 for unknown instructions.
+inline int getAddSubImmediate(MachineInstr &MI) {
+ int Scale = 1;
+ unsigned ImmOp;
+ switch (MI.getOpcode()) {
+ case ARM::t2ADDri:
+ ImmOp = 2;
+ break;
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ ImmOp = 2;
+ Scale = -1;
+ break;
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ ImmOp = 3;
+ Scale = -1;
+ break;
+ default:
+ return 0;
+ }
+ return Scale * MI.getOperand(ImmOp).getImm();
+}
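The sign convention matters to callers folding the result into an offset: SUB forms report the encoded immediate negated, ADD forms report it unchanged. A tiny illustrative analogue (signedAddSubImm is hypothetical, not part of the patch):

constexpr int signedAddSubImm(bool IsSub, unsigned EncodedImm) {
  // SUB-style opcodes scale by -1, ADD-style opcodes by +1, as above.
  return (IsSub ? -1 : 1) * static_cast<int>(EncodedImm);
}
static_assert(signedAddSubImm(/*IsSub=*/true, 8) == -8, "e.g. t2SUBri #8");
static_assert(signedAddSubImm(/*IsSub=*/false, 12) == 12, "e.g. t2ADDri #12");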
+
+// Given a memory access Opcode, check that the given Imm would be a valid Offset
+// for this instruction using its addressing mode.
+inline bool isLegalAddressImm(unsigned Opcode, int Imm,
+ const TargetInstrInfo *TII) {
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ switch (AddrMode) {
+ case ARMII::AddrModeT2_i7:
+ return std::abs(Imm) < (((1 << 7) * 1) - 1);
+ case ARMII::AddrModeT2_i7s2:
+ return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
+ case ARMII::AddrModeT2_i7s4:
+ return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
+ default:
+ llvm_unreachable("Unhandled Addressing mode");
+ }
+}
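The three T2_i7 modes encode a 7-bit immediate scaled by 1, 2 or 4 bytes, and the checks above accept offsets that are multiples of the scale with |Imm| < 128 * Scale - 1, so the largest i7s4 offset accepted here is +/-508. A standalone restatement of those checks exactly as written, not of the full architectural encoding range (legalT2i7Offset is hypothetical):

// Mirrors the comparisons above verbatim; Scale is 1 for AddrModeT2_i7,
// 2 for AddrModeT2_i7s2 and 4 for AddrModeT2_i7s4.
constexpr bool legalT2i7Offset(int Imm, int Scale) {
  int Mag = Imm < 0 ? -Imm : Imm;
  return Mag < 128 * Scale - 1 && Imm % Scale == 0;
}
static_assert(legalT2i7Offset(508, 4), "largest accepted i7s4 offset");
static_assert(!legalT2i7Offset(510, 4), "rejected: not a multiple of 4");
static_assert(!legalT2i7Offset(-512, 4), "rejected: out of range");
static_assert(!legalT2i7Offset(127, 1), "rejected by the strict '<' above");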
+
+// Return true if the given intrinsic is a gather or scatter
+inline bool isGatherScatter(IntrinsicInst *IntInst) {
+ if (IntInst == nullptr)
+ return false;
+ unsigned IntrinsicID = IntInst->getIntrinsicID();
+ return (IntrinsicID == Intrinsic::masked_gather ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
+ IntrinsicID == Intrinsic::masked_scatter ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
+}
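A sketch of how a caller might use this predicate, assuming this header and the usual LLVM IR headers are available; blockHasGatherScatter is a hypothetical helper, not something added by the patch:

#include "ARMBaseInstrInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

// dyn_cast returns nullptr for non-intrinsic instructions, and
// isGatherScatter above explicitly tolerates a null argument.
static bool blockHasGatherScatter(BasicBlock &BB) {
  for (Instruction &I : BB)
    if (isGatherScatter(dyn_cast<IntrinsicInst>(&I)))
      return true;
  return false;
}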
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 52e6d05c3155..3579635f83b5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -220,10 +220,25 @@ getReservedRegs(const MachineFunction &MF) const {
}
bool ARMBaseRegisterInfo::
-isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const {
+isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const {
return !getReservedRegs(MF).test(PhysReg);
}
+bool ARMBaseRegisterInfo::isInlineAsmReadOnlyReg(const MachineFunction &MF,
+ unsigned PhysReg) const {
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+ BitVector Reserved(getNumRegs());
+ markSuperRegs(Reserved, ARM::PC);
+ if (TFI->hasFP(MF))
+ markSuperRegs(Reserved, getFramePointerReg(STI));
+ if (hasBasePointer(MF))
+ markSuperRegs(Reserved, BasePtr);
+ assert(checkAllSuperRegsMarked(Reserved));
+ return Reserved.test(PhysReg);
+}
+
const TargetRegisterClass *
ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const {
@@ -289,7 +304,8 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
// Get the other register in a GPRPair.
-static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
+static MCPhysReg getPairedGPR(MCPhysReg Reg, bool Odd,
+ const MCRegisterInfo *RI) {
for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers)
if (ARM::GPRPairRegClass.contains(*Supers))
return RI->getSubReg(*Supers, Odd ? ARM::gsub_1 : ARM::gsub_0);
@@ -297,15 +313,12 @@ static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
}
// Resolve the RegPairEven / RegPairOdd register allocator hints.
-bool
-ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
- SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
- const LiveRegMatrix *Matrix) const {
+bool ARMBaseRegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+ std::pair<Register, Register> Hint = MRI.getRegAllocationHint(VirtReg);
unsigned Odd;
switch (Hint.first) {
@@ -323,12 +336,12 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
// This register should preferably be even (Odd == 0) or odd (Odd == 1).
// Check if the other part of the pair has already been assigned, and provide
// the paired register as the first hint.
- unsigned Paired = Hint.second;
- if (Paired == 0)
+ Register Paired = Hint.second;
+ if (!Paired)
return false;
- unsigned PairedPhys = 0;
- if (Register::isPhysicalRegister(Paired)) {
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
PairedPhys = Paired;
} else if (VRM && VRM->hasPhys(Paired)) {
PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this);
@@ -339,11 +352,11 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
Hints.push_back(PairedPhys);
// Then prefer even or odd registers.
- for (unsigned Reg : Order) {
+ for (MCPhysReg Reg : Order) {
if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd)
continue;
// Don't provide hints that are paired to a reserved register.
- unsigned Paired = getPairedGPR(Reg, !Odd, this);
+ MCPhysReg Paired = getPairedGPR(Reg, !Odd, this);
if (!Paired || MRI.isReserved(Paired))
continue;
Hints.push_back(Reg);
@@ -351,27 +364,27 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
return false;
}
-void
-ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const {
+void ARMBaseRegisterInfo::updateRegAllocHint(Register Reg, Register NewReg,
+ MachineFunction &MF) const {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
- if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
- Hint.first == (unsigned)ARMRI::RegPairEven) &&
- Register::isVirtualRegister(Hint.second)) {
+ std::pair<Register, Register> Hint = MRI->getRegAllocationHint(Reg);
+ if ((Hint.first == ARMRI::RegPairOdd || Hint.first == ARMRI::RegPairEven) &&
+ Hint.second.isVirtual()) {
// If 'Reg' is one of the even / odd register pair and it's now changed
// (e.g. coalesced) into a different register. The other register of the
// pair allocation hint must be updated to reflect the relationship
// change.
- unsigned OtherReg = Hint.second;
+ Register OtherReg = Hint.second;
Hint = MRI->getRegAllocationHint(OtherReg);
// Make sure the pair has not already divorced.
if (Hint.second == Reg) {
MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
if (Register::isVirtualRegister(NewReg))
MRI->setRegAllocationHint(NewReg,
- Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven
- : ARMRI::RegPairOdd, OtherReg);
+ Hint.first == ARMRI::RegPairOdd
+ ? ARMRI::RegPairEven
+ : ARMRI::RegPairOdd,
+ OtherReg);
}
}
}
@@ -457,14 +470,14 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
/// specified immediate.
void ARMBaseRegisterInfo::emitLoadConstPool(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, Register PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C =
ConstantInt::get(Type::getInt32Ty(MF.getFunction().getContext()), Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
.addReg(DestReg, getDefRegState(true), SubIdx)
@@ -480,11 +493,6 @@ requiresRegisterScavenging(const MachineFunction &MF) const {
}
bool ARMBaseRegisterInfo::
-trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- return true;
-}
-
-bool ARMBaseRegisterInfo::
requiresFrameIndexScavenging(const MachineFunction &MF) const {
return true;
}
@@ -606,9 +614,9 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
// The FP is only available if there is no dynamic realignment. We
// don't know for sure yet whether we'll need that, so we guess based
// on whether there are any local variables that would trigger it.
- unsigned StackAlign = TFI->getStackAlignment();
if (TFI->hasFP(MF) &&
- !((MFI.getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+ !((MFI.getLocalFrameMaxAlign() > TFI->getStackAlign()) &&
+ canRealignStack(MF))) {
if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset))
return false;
}
@@ -626,10 +634,10 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
/// be a pointer to FrameIdx at the beginning of the basic block.
-void ARMBaseRegisterInfo::
-materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
- int64_t Offset) const {
+void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ Register BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
(AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri);
@@ -652,7 +660,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
}
-void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
@@ -680,7 +688,8 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
-bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+ Register BaseReg,
int64_t Offset) const {
const MCInstrDesc &Desc = MI->getDesc();
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
@@ -759,7 +768,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(!AFI->isThumb1OnlyFunction() &&
"This eliminateFrameIndex does not support Thumb1!");
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- unsigned FrameReg;
+ Register FrameReg;
int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 477f3ad0a9a7..0a0907af2141 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -134,7 +134,9 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const override;
+ MCRegister PhysReg) const override;
+ bool isInlineAsmReadOnlyReg(const MachineFunction &MF,
+ unsigned PhysReg) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
@@ -149,14 +151,12 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- bool getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
+ const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
- void updateRegAllocHint(unsigned Reg, unsigned NewReg,
+ void updateRegAllocHint(Register Reg, Register NewReg,
MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;
@@ -165,35 +165,32 @@ public:
int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
+ int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
bool cannotEliminateFrame(const MachineFunction &MF) const;
// Debug information queries.
Register getFrameRegister(const MachineFunction &MF) const override;
- unsigned getBaseRegister() const { return BasePtr; }
-
+ Register getBaseRegister() const { return BasePtr; }
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
virtual void
emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx,
int Val, ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
+ Register PredReg = Register(),
unsigned MIFlags = MachineInstr::NoFlags) const;
/// Code Generation virtual methods...
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp
index 00a2231f59e3..6d389cc82730 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp
@@ -49,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
BBI.Size = 0;
BBI.Unalign = 0;
- BBI.PostAlign = Align::None();
+ BBI.PostAlign = Align(1);
for (MachineInstr &I : *MBB) {
BBI.Size += TII->getInstSizeInBytes(I);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
index 13df399ed995..47d9a4049fa0 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -87,10 +87,10 @@ struct BasicBlockInfo {
/// Compute the offset immediately following this block. If Align is
/// specified, return the offset the successor block will get if it has
/// this alignment.
- unsigned postOffset(Align Alignment = Align::None()) const {
+ unsigned postOffset(Align Alignment = Align(1)) const {
unsigned PO = Offset + Size;
const Align PA = std::max(PostAlign, Alignment);
- if (PA == Align::None())
+ if (PA == Align(1))
return PO;
// Add alignment padding from the terminator.
return PO + UnknownPadding(PA, internalKnownBits());
@@ -101,7 +101,7 @@ struct BasicBlockInfo {
/// instruction alignment. An aligned terminator may increase the number
/// of known bits.
/// If LogAlign is given, also consider the alignment of the next block.
- unsigned postKnownBits(Align Align = Align::None()) const {
+ unsigned postKnownBits(Align Align = llvm::Align(1)) const {
return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits());
}
};
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
index ce260a9ba145..d860473011e7 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -99,17 +99,14 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
LLT p0 = LLT::pointer(0, 32);
LLT s32 = LLT::scalar(32);
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, Register(ARM::SP));
+ auto SPReg = MIRBuilder.buildCopy(p0, Register(ARM::SP));
- Register OffsetReg = MRI.createGenericVirtualRegister(s32);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(s32, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -133,7 +130,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
Register ExtReg = extendRegister(ValVReg, VA);
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 1);
+ Align(1));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -143,7 +140,10 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
CCValAssign VA = VAs[0];
assert(VA.needsCustom() && "Value doesn't need custom handling");
- assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ // Custom lowering for other types, such as f16, is currently not supported
+ if (VA.getValVT() != MVT::f64)
+ return 0;
CCValAssign NextVA = VAs[1];
assert(NextVA.needsCustom() && "Value doesn't need custom handling");
@@ -203,7 +203,7 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// Even if there is no splitting to do, we still want to replace the
// original type (e.g. pointer type -> integer).
auto Flags = OrigArg.Flags[0];
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty)));
+ Flags.setOrigAlign(DL.getABITypeAlign(OrigArg.Ty));
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
Flags, OrigArg.IsFixed);
return;
@@ -215,7 +215,7 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
auto Flags = OrigArg.Flags[0];
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy)));
+ Flags.setOrigAlign(DL.getABITypeAlign(SplitTy));
bool NeedsConsecutiveRegisters =
TLI.functionArgumentNeedsConsecutiveRegisters(
@@ -299,11 +299,8 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg =
- MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
-
- return AddrReg;
+ return MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI)
+ .getReg(0);
}
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
@@ -318,20 +315,21 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
Size = 4;
assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
- auto LoadVReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- buildLoad(LoadVReg, Addr, Size, /* Alignment */ 1, MPO);
+ auto LoadVReg = buildLoad(LLT::scalar(32), Addr, Size, MPO);
MIRBuilder.buildTrunc(ValVReg, LoadVReg);
} else {
// If the value is not extended, a simple load will suffice.
- buildLoad(ValVReg, Addr, Size, /* Alignment */ 1, MPO);
+ buildLoad(ValVReg, Addr, Size, MPO);
}
}
- void buildLoad(Register Val, Register Addr, uint64_t Size, unsigned Alignment,
- MachinePointerInfo &MPO) {
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad, Size, Alignment);
- MIRBuilder.buildLoad(Val, Addr, *MMO);
+ MachineInstrBuilder buildLoad(const DstOp &Res, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO) {
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size,
+ inferAlignFromPtrInfo(MF, MPO));
+ return MIRBuilder.buildLoad(Res, Addr, *MMO);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -354,9 +352,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
// We cannot create a truncating copy, nor a trunc of a physical register.
// Therefore, we need to copy the content of the physical register into a
// virtual one and then truncate that.
- auto PhysRegToVReg =
- MRI.createGenericVirtualRegister(LLT::scalar(LocSize));
- MIRBuilder.buildCopy(PhysRegToVReg, PhysReg);
+ auto PhysRegToVReg = MIRBuilder.buildCopy(LLT::scalar(LocSize), PhysReg);
MIRBuilder.buildTrunc(ValVReg, PhysRegToVReg);
}
}
@@ -367,7 +363,10 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
CCValAssign VA = VAs[0];
assert(VA.needsCustom() && "Value doesn't need custom handling");
- assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ // Custom lowering for other types, such as f16, is currently not supported
+ if (VA.getValVT() != MVT::f64)
+ return 0;
CCValAssign NextVA = VAs[1];
assert(NextVA.needsCustom() && "Value doesn't need custom handling");
@@ -436,7 +435,7 @@ bool ARMCallLowering::lowerFormalArguments(
for (auto &Arg : F.args()) {
if (!isSupportedType(DL, TLI, Arg.getType()))
return false;
- if (Arg.hasByValOrInAllocaAttr())
+ if (Arg.hasPassPointeeByValueAttr())
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
index a47c59512592..67c822a5b6ef 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp
@@ -32,9 +32,8 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
// Put the whole thing on the stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8, 4),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(8, Align(4)), LocVT, LocInfo));
return true;
}
@@ -42,9 +41,8 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
if (unsigned Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(4, 4),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(4, Align(4)), LocVT, LocInfo));
return true;
}
@@ -81,9 +79,8 @@ static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
// Put the whole thing on the stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8, 8),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(8, Align(8)), LocVT, LocInfo));
return true;
}
@@ -184,8 +181,8 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// aggregate. Store the type's required alignment as extra info for later: in
// the [N x i64] case all trace has been removed by the time we actually get
// to do allocation.
- PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
- ArgFlags.getOrigAlign()));
+ PendingMembers.push_back(CCValAssign::getPending(
+ ValNo, ValVT, LocVT, LocInfo, ArgFlags.getNonZeroOrigAlign().value()));
if (!ArgFlags.isInConsecutiveRegsLast())
return true;
@@ -193,8 +190,9 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
auto &DL = State.getMachineFunction().getDataLayout();
- unsigned StackAlign = DL.getStackAlignment().value();
- unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
+ const Align StackAlign = DL.getStackAlignment();
+ const Align FirstMemberAlign(PendingMembers[0].getExtraInfo());
+ Align Alignment = std::min(FirstMemberAlign, StackAlign);
ArrayRef<MCPhysReg> RegList;
switch (LocVT.SimpleTy) {
@@ -204,21 +202,24 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// First consume all registers that would give an unaligned object. Whether
// we go on stack or in regs, no-one will be using them in future.
- unsigned RegAlign = alignTo(Align, 4) / 4;
+ unsigned RegAlign = alignTo(Alignment.value(), 4) / 4;
while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
State.AllocateReg(RegList[RegIdx++]);
break;
}
case MVT::f16:
+ case MVT::bf16:
case MVT::f32:
RegList = SRegList;
break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::f64:
RegList = DRegList;
break;
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v2f64:
RegList = QRegList;
break;
@@ -247,7 +248,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
unsigned RegIdx = State.getFirstUnallocated(RegList);
for (auto &It : PendingMembers) {
if (RegIdx >= RegList.size())
- It.convertToMem(State.AllocateStack(Size, Size));
+ It.convertToMem(State.AllocateStack(Size, Align(Size)));
else
It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
@@ -265,12 +266,12 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// After the first item has been allocated, the rest are packed as tightly as
// possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
// be allocating a bunch of i32 slots).
- unsigned RestAlign = std::min(Align, Size);
+ const Align RestAlign = std::min(Alignment, Align(Size));
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, Align));
+ It.convertToMem(State.AllocateStack(Size, Alignment));
State.addLoc(It);
- Align = RestAlign;
+ Alignment = RestAlign;
}
// All pending members have now been allocated
@@ -280,5 +281,33 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
return true;
}
+static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, CCState &State,
+ ArrayRef<MCPhysReg> RegList) {
+ unsigned Reg = State.AllocateReg(RegList);
+ if (Reg) {
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+}
+
+static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ // f16 arguments are extended to i32 and assigned to a register in [r0, r3]
+ return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State,
+ RRegList);
+}
+
+static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
+ CCState &State) {
+ // f16 arguments are extended to f32 and assigned to a register in [s0, s15]
+ return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State,
+ SRegList);
+}
+
// Include the table generated calling convention implementations.
#include "ARMGenCallingConv.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
index 5df5b56f5afa..3517274e4c5c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -10,7 +10,7 @@
/// CCIfAlign - Match of the original alignment of the arg
class CCIfAlign<string Align, CCAction A>:
- CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+ CCIf<!strconcat("ArgFlags.getNonZeroOrigAlign() == ", Align), A>;
//===----------------------------------------------------------------------===//
// ARM APCS Calling Convention
@@ -30,8 +30,8 @@ def CC_ARM_APCS : CallingConv<[
CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
@@ -56,8 +56,8 @@ def RetCC_ARM_APCS : CallingConv<[
CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
@@ -71,8 +71,8 @@ def RetCC_ARM_APCS : CallingConv<[
let Entry = 1 in
def FastCC_ARM_APCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -91,8 +91,8 @@ def FastCC_ARM_APCS : CallingConv<[
let Entry = 1 in
def RetFastCC_ARM_APCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -108,8 +108,8 @@ def RetFastCC_ARM_APCS : CallingConv<[
let Entry = 1 in
def CC_ARM_APCS_GHC : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
@@ -134,12 +134,12 @@ def CC_ARM_AAPCS_Common : CallingConv<[
// i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
// (and the same is true for f64 if VFP is not enabled)
CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
- CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8",
+ CCIfType<[i32], CCIf<"ArgFlags.getNonZeroOrigAlign() != Align(8)",
CCAssignToReg<[R0, R1, R2, R3]>>>,
CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
- CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f16, bf16, f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
CCIfType<[v2f64], CCIfAlign<"16",
CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,
@@ -165,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[R12]>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -176,14 +176,15 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
let Entry = 1 in
def RetCC_ARM_AAPCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -193,6 +194,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@@ -208,8 +210,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -224,14 +226,15 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
let Entry = 1 in
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -242,7 +245,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
- S9, S10, S11, S12, S13, S14, S15]>>,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 66ad120a111f..195d0a89291b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -206,10 +206,6 @@ namespace {
/// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
SmallVector<MachineInstr*, 4> T2JumpTables;
- /// HasFarJump - True if any far jump instruction has been emitted during
- /// the branch fix up pass.
- bool HasFarJump;
-
MachineFunction *MF;
MachineConstantPool *MCP;
const ARMBaseInstrInfo *TII;
@@ -270,7 +266,6 @@ namespace {
bool fixupImmediateBr(ImmBranch &Br);
bool fixupConditionalBr(ImmBranch &Br);
bool fixupUnconditionalBr(ImmBranch &Br);
- bool undoLRSpillRestore();
bool optimizeThumb2Instructions();
bool optimizeThumb2Branches();
bool reorderThumb2JumpTables();
@@ -350,7 +345,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: "
<< MCP->getConstants().size() << " CP entries, aligned to "
- << MCP->getConstantPoolAlignment() << " bytes *****\n");
+ << MCP->getConstantPoolAlign().value() << " bytes *****\n");
STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
TII = STI->getInstrInfo();
@@ -363,7 +358,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
isThumb1 = AFI->isThumb1OnlyFunction();
isThumb2 = AFI->isThumb2Function();
- HasFarJump = false;
bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
// Renumber all of the machine basic blocks in the function, guaranteeing that
@@ -456,11 +450,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// After a while, this might be made debug-only, but it is not expensive.
verify();
- // If LR has been forced spilled and no far jump (i.e. BL) has been issued,
- // undo the spill / restore of LR if possible.
- if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
- MadeChange |= undoLRSpillRestore();
-
// Save the mapping between original and cloned constpool entries.
for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
@@ -494,7 +483,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
MF->push_back(BB);
// MachineConstantPool measures alignment in bytes.
- const Align MaxAlign(MCP->getConstantPoolAlignment());
+ const Align MaxAlign = MCP->getConstantPoolAlign();
const unsigned MaxLogAlign = Log2(MaxAlign);
// Mark the basic block as required by the const-pool.
@@ -518,14 +507,13 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
- unsigned Align = CPs[i].getAlignment();
- assert(isPowerOf2_32(Align) && "Invalid alignment");
+ Align Alignment = CPs[i].getAlign();
// Verify that all constant pool entries are a multiple of their alignment.
// If not, we would have to pad them out so that instructions stay aligned.
- assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+ assert(isAligned(Alignment, Size) && "CP Entry not multiple of 4 bytes!");
// Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
- unsigned LogAlign = Log2_32(Align);
+ unsigned LogAlign = Log2(Alignment);
MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
MachineInstr *CPEMI =
BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
@@ -542,7 +530,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align << '\n');
+ << Size << ", align = " << Alignment.value() << '\n');
}
LLVM_DEBUG(BB->dump());
}
@@ -668,7 +656,7 @@ Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) {
unsigned CPI = getCombinedIndex(CPEMI);
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
- return Align(MCP->getConstants()[CPI].getAlignment());
+ return MCP->getConstants()[CPI].getAlign();
}
/// scanFunctionJumpTables - Do a scan of the function, building up
@@ -1364,8 +1352,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// displacement.
MachineBasicBlock::iterator I = UserMI;
++I;
- for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI),
- PredReg = 0;
+ Register PredReg;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
I->getOpcode() != ARM::t2IT &&
getITInstrPredicate(*I, PredReg) != ARMCC::AL;
Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) {
@@ -1410,7 +1398,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Avoid splitting an IT block.
if (LastIT) {
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC != ARMCC::AL)
MI = LastIT;
@@ -1434,7 +1422,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// We really must not split an IT block.
#ifndef NDEBUG
- unsigned PredReg;
+ Register PredReg;
assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL);
#endif
NewMBB = splitBlockBeforeInstr(&*MI);
@@ -1566,7 +1554,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
BBInfo[CPEBB->getNumber()].Size = 0;
// This block no longer needs to be aligned.
- CPEBB->setAlignment(Align::None());
+ CPEBB->setAlignment(Align(1));
} else {
// Entries are sorted by descending alignment, so realign from the front.
CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin()));
@@ -1633,7 +1621,6 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
BBInfoVector &BBInfo = BBUtils->getBBInfo();
BBInfo[MBB->getNumber()].Size += 2;
BBUtils->adjustBBOffsetsAfter(MBB);
- HasFarJump = true;
++NumUBrFixed;
LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
@@ -1735,34 +1722,6 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
return true;
}
-/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills
-/// LR / restores LR to pc. FIXME: This is done here because it's only possible
-/// to do this if tBfar is not used.
-bool ARMConstantIslands::undoLRSpillRestore() {
- bool MadeChange = false;
- for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
- MachineInstr *MI = PushPopMIs[i];
- // First two operands are predicates.
- if (MI->getOpcode() == ARM::tPOP_RET &&
- MI->getOperand(2).getReg() == ARM::PC &&
- MI->getNumExplicitOperands() == 3) {
- // Create the new insn and copy the predicate from the old.
- BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
- .add(MI->getOperand(0))
- .add(MI->getOperand(1));
- MI->eraseFromParent();
- MadeChange = true;
- } else if (MI->getOpcode() == ARM::tPUSH &&
- MI->getOperand(2).getReg() == ARM::LR &&
- MI->getNumExplicitOperands() == 3) {
- // Just remove the push.
- MI->eraseFromParent();
- MadeChange = true;
- }
- }
- return MadeChange;
-}
-
bool ARMConstantIslands::optimizeThumb2Instructions() {
bool MadeChange = false;
@@ -1868,7 +1827,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
if (!Br.MI->killsRegister(ARM::CPSR))
return false;
- unsigned PredReg = 0;
+ Register PredReg;
unsigned NewOpc = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
if (Pred == ARMCC::EQ)
@@ -2402,6 +2361,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
SmallVector<MachineOperand, 4> CondPrior;
MachineFunction::iterator BBi = BB->getIterator();
MachineFunction::iterator OldPrior = std::prev(BBi);
+ MachineFunction::iterator OldNext = std::next(BBi);
// If the block terminator isn't analyzable, don't try to move the block
bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond);
@@ -2412,8 +2372,8 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
if (!B && Cond.empty() && BB != &MF->front() &&
!TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
BB->moveAfter(JTBB);
- OldPrior->updateTerminator();
- BB->updateTerminator();
+ OldPrior->updateTerminator(BB);
+ BB->updateTerminator(OldNext != MF->end() ? &*OldNext : nullptr);
// Update numbering to account for the block being moved.
MF->RenumberBlocks();
++NumJTMoved;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
index 72c95f441265..c1df7ef43cad 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -73,7 +73,7 @@ StringRef ARMConstantPoolValue::getModifierText() const {
}
int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
llvm_unreachable("Shouldn't be calling this directly!");
}
@@ -189,7 +189,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
}
int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
int index =
getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
if (index != -1) {
@@ -228,7 +228,7 @@ ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, StringRef s,
bool AddCurrentAddress)
: ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
AddCurrentAddress),
- S(s) {}
+ S(std::string(s)) {}
ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C,
StringRef s, unsigned ID,
@@ -237,7 +237,7 @@ ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C,
}
int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
}
@@ -277,7 +277,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
}
int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h
index 660b7fc88d82..261070a74ba3 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -76,13 +76,11 @@ protected:
bool AddCurrentAddress);
template <typename Derived>
- int getExistingMachineCPValueImpl(MachineConstantPool *CP,
- unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
+ int getExistingMachineCPValueImpl(MachineConstantPool *CP, Align Alignment) {
const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
if (Constants[i].isMachineConstantPoolEntry() &&
- (Constants[i].getAlignment() & AlignMask) == 0) {
+ Constants[i].getAlign() >= Alignment) {
auto *CPV =
static_cast<ARMConstantPoolValue*>(Constants[i].Val.MachineCPVal);
if (Derived *APC = dyn_cast<Derived>(CPV))
@@ -114,7 +112,7 @@ public:
bool isPromotedGlobal() const{ return Kind == ARMCP::CPPromotedGlobal; }
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
@@ -187,7 +185,7 @@ public:
}
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
/// hasSameValue - Return true if this ARM constpool value can share the same
/// constantpool entry as another ARM constpool value.
@@ -223,7 +221,7 @@ public:
StringRef getSymbol() const { return S; }
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
@@ -259,7 +257,7 @@ public:
const MachineBasicBlock *getMBB() const { return MBB; }
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
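A small standalone check (not part of the patch) of why replacing the old AlignMask test in getExistingMachineCPValueImpl with `getAlign() >= Alignment` preserves behaviour when both values are powers of two:

#include <cassert>
#include <initializer_list>

// Old form: the entry's alignment has no bits set below the required one.
static bool fitsOld(unsigned EntryAlign, unsigned Required) {
  return (EntryAlign & (Required - 1)) == 0;
}
// New form: the entry is at least as aligned as required.
static bool fitsNew(unsigned EntryAlign, unsigned Required) {
  return EntryAlign >= Required;
}

int main() {
  for (unsigned E : {1u, 2u, 4u, 8u, 16u, 32u})
    for (unsigned R : {1u, 2u, 4u, 8u, 16u, 32u})
      assert(fitsOld(E, R) == fitsNew(E, R)); // equivalent for powers of two
  return 0;
}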
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index de4377ec5a47..48622aae3cb4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -71,6 +71,38 @@ namespace {
unsigned Opc, bool IsExt);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
+ void CMSEClearGPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ const SmallVectorImpl<unsigned> &ClearRegs,
+ unsigned ClobberReg);
+ MachineBasicBlock &CMSEClearFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock &CMSEClearFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs);
+ MachineBasicBlock &CMSEClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs);
+ void CMSESaveClearFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs,
+ SmallVectorImpl<unsigned> &AvailableRegs);
+ void CMSESaveClearFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs,
+ SmallVectorImpl<unsigned> &ScratchRegs);
+ void CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs);
+ void CMSERestoreFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs);
+ void CMSERestoreFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs);
+ void CMSERestoreFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs);
bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
unsigned StrexOp, unsigned UxtOp,
@@ -417,8 +449,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
// Make sure the table is sorted.
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
- "NEONLdStTable is not sorted!");
+ assert(llvm::is_sorted(NEONLdStTable) && "NEONLdStTable is not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
#endif
@@ -827,7 +858,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
@@ -852,10 +883,13 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned ImmVal = (unsigned)MO.getImm();
unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ unsigned MIFlags = MI.getFlags();
LO16 = LO16.addImm(SOImmValV1);
HI16 = HI16.addImm(SOImmValV2);
LO16.cloneMemRefs(MI);
HI16.cloneMemRefs(MI);
+ LO16.setMIFlags(MIFlags);
+ HI16.setMIFlags(MIFlags);
LO16.addImm(Pred).addReg(PredReg).add(condCodeOp());
HI16.addImm(Pred).addReg(PredReg).add(condCodeOp());
if (isCC)
@@ -867,6 +901,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned LO16Opc = 0;
unsigned HI16Opc = 0;
+ unsigned MIFlags = MI.getFlags();
if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
LO16Opc = ARM::t2MOVi16;
HI16Opc = ARM::t2MOVTi16;
@@ -880,6 +915,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg);
+ LO16.setMIFlags(MIFlags);
+ HI16.setMIFlags(MIFlags);
+
switch (MO.getType()) {
case MachineOperand::MO_Immediate: {
unsigned Imm = MO.getImm();
@@ -921,6 +959,582 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump(););
}
+// The size of the area accessed by VLSTM/VLLDM:
+// S0-S31 (128 bytes) plus 8 more bytes (FPSCR and VPR, or FPSCR and pad)
+static const int CMSE_FP_SAVE_SIZE = 136;
+
+static void determineGPRegsToClear(const MachineInstr &MI,
+ const std::initializer_list<unsigned> &Regs,
+ SmallVectorImpl<unsigned> &ClearRegs) {
+ SmallVector<unsigned, 4> OpRegs;
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ OpRegs.push_back(Op.getReg());
+ }
+ llvm::sort(OpRegs);
+
+ std::set_difference(Regs.begin(), Regs.end(), OpRegs.begin(), OpRegs.end(),
+ std::back_inserter(ClearRegs));
+}
+
+void ARMExpandPseudo::CMSEClearGPRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const SmallVectorImpl<unsigned> &ClearRegs,
+ unsigned ClobberReg) {
+
+ if (STI->hasV8_1MMainlineOps()) {
+ // Clear the registers using the CLRM instruction.
+ MachineInstrBuilder CLRM =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2CLRM)).add(predOps(ARMCC::AL));
+ for (unsigned R : ClearRegs)
+ CLRM.addReg(R, RegState::Define);
+ CLRM.addReg(ARM::APSR, RegState::Define);
+ CLRM.addReg(ARM::CPSR, RegState::Define | RegState::Implicit);
+ } else {
+ // Clear the registers and flags by copying ClobberReg into them.
+ // (Baseline can't do a high register clear in one instruction).
+ for (unsigned Reg : ClearRegs) {
+ if (Reg == ClobberReg)
+ continue;
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVr), Reg)
+ .addReg(ClobberReg)
+ .add(predOps(ARMCC::AL));
+ }
+
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2MSR_M))
+ .addImm(STI->hasDSP() ? 0xc00 : 0x800)
+ .addReg(ClobberReg)
+ .add(predOps(ARMCC::AL));
+ }
+}
+
+// Find which FP registers need to be cleared. The parameter `ClearRegs` is
+// initialised with all elements set to true, and this function resets the
+// bits that correspond to register uses. Returns true if any floating point
+// register is defined, false otherwise.
+static bool determineFPRegsToClear(const MachineInstr &MI,
+ BitVector &ClearRegs) {
+ bool DefFP = false;
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+
+ unsigned Reg = Op.getReg();
+ if (Op.isDef()) {
+ if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) ||
+ (Reg >= ARM::D0 && Reg <= ARM::D15) ||
+ (Reg >= ARM::S0 && Reg <= ARM::S31))
+ DefFP = true;
+ continue;
+ }
+
+ if (Reg >= ARM::Q0 && Reg <= ARM::Q7) {
+ int R = Reg - ARM::Q0;
+ ClearRegs.reset(R * 4, (R + 1) * 4);
+ } else if (Reg >= ARM::D0 && Reg <= ARM::D15) {
+ int R = Reg - ARM::D0;
+ ClearRegs.reset(R * 2, (R + 1) * 2);
+ } else if (Reg >= ARM::S0 && Reg <= ARM::S31) {
+ ClearRegs[Reg - ARM::S0] = false;
+ }
+ }
+ return DefFP;
+}
+
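The reset() ranges above follow the usual ARM register aliasing; a throwaway sketch in terms of S-register indices (not ARM:: enums):

#include <utility>

// Hypothetical helpers: Qn aliases S(4n)..S(4n+3), Dn aliases S(2n)..S(2n+1).
std::pair<int, int> sRangeOfQ(int Q) { return {Q * 4, Q * 4 + 3}; }
std::pair<int, int> sRangeOfD(int D) { return {D * 2, D * 2 + 1}; }
// e.g. a use of Q1 resets bits 4..7 of ClearRegs; a use of D3 resets bits 6..7.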
+MachineBasicBlock &
+ARMExpandPseudo::CMSEClearFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ BitVector ClearRegs(16, true);
+ (void)determineFPRegsToClear(*MBBI, ClearRegs);
+
+ if (STI->hasV8_1MMainlineOps())
+ return CMSEClearFPRegsV81(MBB, MBBI, ClearRegs);
+ else
+ return CMSEClearFPRegsV8(MBB, MBBI, ClearRegs);
+}
+
+// Clear the FP registers for v8.0-M by copying the contents of LR over
+// them. Uses R12 as a scratch register.
+MachineBasicBlock &
+ARMExpandPseudo::CMSEClearFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs) {
+ if (!STI->hasFPRegs())
+ return MBB;
+
+ auto &RetI = *MBBI;
+ const DebugLoc &DL = RetI.getDebugLoc();
+
+ // If optimising for minimum size, clear FP registers unconditionally.
+ // Otherwise, check the CONTROL.SFPA (Secure Floating-Point Active) bit and
+ // don't clear them if they belong to the non-secure state.
+ MachineBasicBlock *ClearBB, *DoneBB;
+ if (STI->hasMinSize()) {
+ ClearBB = DoneBB = &MBB;
+ } else {
+ MachineFunction *MF = MBB.getParent();
+ ClearBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), ClearBB);
+ MF->insert(++ClearBB->getIterator(), DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MBBI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ MBB.addSuccessor(ClearBB);
+ MBB.addSuccessor(DoneBB);
+ ClearBB->addSuccessor(DoneBB);
+
+ // At the new basic blocks we need to mark as live-in the registers used
+ // for the return value, as well as LR, which is used to clear registers.
+ for (const MachineOperand &Op : RetI.operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ if (Reg == ARM::NoRegister || Reg == ARM::LR)
+ continue;
+ assert(Register::isPhysicalRegister(Reg) && "Unallocated register");
+ ClearBB->addLiveIn(Reg);
+ DoneBB->addLiveIn(Reg);
+ }
+ ClearBB->addLiveIn(ARM::LR);
+ DoneBB->addLiveIn(ARM::LR);
+
+ // Read the CONTROL register.
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2MRS_M), ARM::R12)
+ .addImm(20)
+ .add(predOps(ARMCC::AL));
+ // Check bit 3 (SFPA).
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2TSTri))
+ .addReg(ARM::R12)
+ .addImm(8)
+ .add(predOps(ARMCC::AL));
+ // If SFPA is clear, jump over ClearBB to DoneBB.
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::tBcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::EQ)
+ .addReg(ARM::CPSR, RegState::Kill);
+ }
+
+ // Emit the clearing sequence
+ for (unsigned D = 0; D < 8; D++) {
+ // Attempt to clear as double
+ if (ClearRegs[D * 2 + 0] && ClearRegs[D * 2 + 1]) {
+ unsigned Reg = ARM::D0 + D;
+ BuildMI(ClearBB, DL, TII->get(ARM::VMOVDRR), Reg)
+ .addReg(ARM::LR)
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ } else {
+ // Clear first part as single
+ if (ClearRegs[D * 2 + 0]) {
+ unsigned Reg = ARM::S0 + D * 2;
+ BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ }
+ // Clear second part as single
+ if (ClearRegs[D * 2 + 1]) {
+ unsigned Reg = ARM::S0 + D * 2 + 1;
+ BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ }
+ }
+ }
+
+ // Clear FPSCR bits 0-4, 7, 28-31
+ // The other bits are program global according to the AAPCS
+ BuildMI(ClearBB, DL, TII->get(ARM::VMRS), ARM::R12)
+ .add(predOps(ARMCC::AL));
+ BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12)
+ .addReg(ARM::R12)
+ .addImm(0x0000009F)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12)
+ .addReg(ARM::R12)
+ .addImm(0xF0000000)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(ClearBB, DL, TII->get(ARM::VMSR))
+ .addReg(ARM::R12)
+ .add(predOps(ARMCC::AL));
+
+ return *DoneBB;
+}
+
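The two BIC immediates in the FPSCR-clearing sequence above select exactly the caller-visible fields; a compile-time check of the mask arithmetic (illustrative, bit names per the Arm architecture manual):

// Bits 0-4 are the cumulative exception flags (IOC, DZC, OFC, UFC, IXC),
// bit 7 is IDC, and bits 28-31 are the N, Z, C, V condition flags.
static_assert(0x0000009Fu == (0x1Fu | (1u << 7)), "bits 0-4 and bit 7");
static_assert(0xF0000000u == (0xFu << 28), "bits 28-31");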
+MachineBasicBlock &
+ARMExpandPseudo::CMSEClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs) {
+ auto &RetI = *MBBI;
+
+ // Emit a sequence of VSCCLRM <sreglist> instructions, one instruction for
+ // each contiguous sequence of S-registers.
+ int Start = -1, End = -1;
+ for (int S = 0, E = ClearRegs.size(); S != E; ++S) {
+ if (ClearRegs[S] && S == End + 1) {
+ End = S; // extend range
+ continue;
+ }
+ // Emit current range.
+ if (Start < End) {
+ MachineInstrBuilder VSCCLRM =
+ BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS))
+ .add(predOps(ARMCC::AL));
+ while (++Start <= End)
+ VSCCLRM.addReg(ARM::S0 + Start, RegState::Define);
+ VSCCLRM.addReg(ARM::VPR, RegState::Define);
+ }
+ Start = End = S;
+ }
+ // Emit last range.
+ if (Start < End) {
+ MachineInstrBuilder VSCCLRM =
+ BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS))
+ .add(predOps(ARMCC::AL));
+ while (++Start <= End)
+ VSCCLRM.addReg(ARM::S0 + Start, RegState::Define);
+ VSCCLRM.addReg(ARM::VPR, RegState::Define);
+ }
+
+ return MBB;
+}
+
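The loop above emits one VSCCLRM per maximal run of set bits in ClearRegs; the same grouping, reduced to a standalone sketch over plain bools (Emit stands in for building the instruction):

#include <functional>
#include <vector>

void forEachSetRange(const std::vector<bool> &Regs,
                     const std::function<void(int, int)> &Emit) {
  int Start = -1;
  for (int S = 0, E = static_cast<int>(Regs.size()); S != E; ++S) {
    if (Regs[S]) {
      if (Start < 0)
        Start = S; // open a new run
    } else if (Start >= 0) {
      Emit(Start, S - 1); // close the current run
      Start = -1;
    }
  }
  if (Start >= 0) // flush a run that reaches the end
    Emit(Start, static_cast<int>(Regs.size()) - 1);
}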
+void ARMExpandPseudo::CMSESaveClearFPRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) {
+ if (STI->hasV8_1MMainlineOps())
+ CMSESaveClearFPRegsV81(MBB, MBBI, DL, LiveRegs);
+ else
+ CMSESaveClearFPRegsV8(MBB, MBBI, DL, LiveRegs, ScratchRegs);
+}
+
+// Save and clear FP registers if present
+void ARMExpandPseudo::CMSESaveClearFPRegsV8(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) {
+ if (!STI->hasFPRegs())
+ return;
+
+ // Store an available register for FPSCR clearing
+ assert(!ScratchRegs.empty());
+ unsigned SpareReg = ScratchRegs.front();
+
+ // save space on stack for VLSTM
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+
+ // Use ScratchRegs to store the fp regs
+ std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs;
+ std::vector<unsigned> NonclearedFPRegs;
+ for (const MachineOperand &Op : MBBI->operands()) {
+ if (Op.isReg() && Op.isUse()) {
+ unsigned Reg = Op.getReg();
+ assert(!ARM::DPRRegClass.contains(Reg) ||
+ ARM::DPR_VFP2RegClass.contains(Reg));
+ assert(!ARM::QPRRegClass.contains(Reg));
+ if (ARM::DPR_VFP2RegClass.contains(Reg)) {
+ if (ScratchRegs.size() >= 2) {
+ unsigned SaveReg2 = ScratchRegs.pop_back_val();
+ unsigned SaveReg1 = ScratchRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD))
+ .addReg(SaveReg1, RegState::Define)
+ .addReg(SaveReg2, RegState::Define)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ if (ScratchRegs.size() >= 1) {
+ unsigned SaveReg = ScratchRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg, 0);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ }
+ }
+ }
+
+ bool passesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty());
+
+ // Lazy store all fp registers to the stack
+ MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
+ ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
+ VLSTM.addReg(R, RegState::Implicit |
+ (LiveRegs.contains(R) ? 0 : RegState::Undef));
+
+ // Restore all arguments
+ for (const auto &Regs : ClearedFPRegs) {
+ unsigned Reg, SaveReg1, SaveReg2;
+ std::tie(Reg, SaveReg1, SaveReg2) = Regs;
+ if (ARM::DPR_VFP2RegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg)
+ .addReg(SaveReg1)
+ .addReg(SaveReg2)
+ .add(predOps(ARMCC::AL));
+ else if (ARM::SPRRegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(SaveReg1)
+ .add(predOps(ARMCC::AL));
+ }
+
+ for (unsigned Reg : NonclearedFPRegs) {
+ if (ARM::DPR_VFP2RegClass.contains(Reg)) {
+ if (STI->isLittle()) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRD), Reg)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2)
+ .add(predOps(ARMCC::AL));
+ } else {
+ // For big-endian targets we need to load the two subregisters of Reg
+ // manually because VLDRD would load them in the wrong order.
+ unsigned SReg0 = TRI->getSubReg(Reg, ARM::ssub_0);
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0 + 1)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2 + 1)
+ .add(predOps(ARMCC::AL));
+ }
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), Reg)
+ .addReg(ARM::SP)
+ .addImm(Reg - ARM::S0)
+ .add(predOps(ARMCC::AL));
+ }
+ }
+ // restore FPSCR from stack and clear bits 0-4, 7, 28-31
+ // The other bits are program global according to the AAPCS
+ if (passesFPReg) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2LDRi8), SpareReg)
+ .addReg(ARM::SP)
+ .addImm(0x40)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg)
+ .addReg(SpareReg)
+ .addImm(0x0000009F)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg)
+ .addReg(SpareReg)
+ .addImm(0xF0000000)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMSR))
+ .addReg(SpareReg)
+ .add(predOps(ARMCC::AL));
+ // The ldr must happen after a floating point instruction. To prevent the
+ // post-ra scheduler from reordering them, we create a bundle.
+ finalizeBundle(MBB, VLSTM->getIterator(), MBBI->getIterator());
+ }
+}
+
+void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc &DL,
+ const LivePhysRegs &LiveRegs) {
+ BitVector ClearRegs(32, true);
+ bool DefFP = determineFPRegsToClear(*MBBI, ClearRegs);
+
+ // If the instruction does not write to a FP register and no elements were
+ // removed from the set, then no FP registers were used to pass
+ // arguments/returns.
+ if (!DefFP && ClearRegs.count() == ClearRegs.size()) {
+ // save space on stack for VLSTM
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+
+ // Lazy store all FP registers to the stack
+ MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
+ ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
+ VLSTM.addReg(R, RegState::Implicit |
+ (LiveRegs.contains(R) ? 0 : RegState::Undef));
+ } else {
+ // Push all the callee-saved registers (s16-s31).
+ MachineInstrBuilder VPUSH =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTMSDB_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg)
+ VPUSH.addReg(Reg);
+
+ // Clear FP registers with a VSCCLRM.
+ (void)CMSEClearFPRegsV81(MBB, MBBI, ClearRegs);
+
+ // Save floating-point context.
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTR_FPCXTS_pre), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(-8)
+ .add(predOps(ARMCC::AL));
+ }
+}
+
+// Restore FP registers if present
+void ARMExpandPseudo::CMSERestoreFPRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs) {
+ if (STI->hasV8_1MMainlineOps())
+ CMSERestoreFPRegsV81(MBB, MBBI, DL, AvailableRegs);
+ else
+ CMSERestoreFPRegsV8(MBB, MBBI, DL, AvailableRegs);
+}
+
+void ARMExpandPseudo::CMSERestoreFPRegsV8(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs) {
+ if (!STI->hasFPRegs())
+ return;
+
+ // Use AvailableRegs to store the fp regs
+ std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs;
+ std::vector<unsigned> NonclearedFPRegs;
+ for (const MachineOperand &Op : MBBI->operands()) {
+ if (Op.isReg() && Op.isDef()) {
+ unsigned Reg = Op.getReg();
+ assert(!ARM::DPRRegClass.contains(Reg) ||
+ ARM::DPR_VFP2RegClass.contains(Reg));
+ assert(!ARM::QPRRegClass.contains(Reg));
+ if (ARM::DPR_VFP2RegClass.contains(Reg)) {
+ if (AvailableRegs.size() >= 2) {
+ unsigned SaveReg2 = AvailableRegs.pop_back_val();
+ unsigned SaveReg1 = AvailableRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD))
+ .addReg(SaveReg1, RegState::Define)
+ .addReg(SaveReg2, RegState::Define)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ if (AvailableRegs.size() >= 1) {
+ unsigned SaveReg = AvailableRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg, 0);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ }
+ }
+ }
+
+ // Push FP regs that cannot be restored via normal registers on the stack
+ for (unsigned Reg : NonclearedFPRegs) {
+ if (ARM::DPR_VFP2RegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRD), Reg)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2)
+ .add(predOps(ARMCC::AL));
+ else if (ARM::SPRRegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRS), Reg)
+ .addReg(ARM::SP)
+ .addImm(Reg - ARM::S0)
+ .add(predOps(ARMCC::AL));
+ }
+
+ // Lazy load fp regs from stack
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+
+ // Restore all FP registers via normal registers
+ for (const auto &Regs : ClearedFPRegs) {
+ unsigned Reg, SaveReg1, SaveReg2;
+ std::tie(Reg, SaveReg1, SaveReg2) = Regs;
+ if (ARM::DPR_VFP2RegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg)
+ .addReg(SaveReg1)
+ .addReg(SaveReg2)
+ .add(predOps(ARMCC::AL));
+ else if (ARM::SPRRegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(SaveReg1)
+ .add(predOps(ARMCC::AL));
+ }
+
+ // Pop the stack space
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+}
+
+static bool definesOrUsesFPReg(const MachineInstr &MI) {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) ||
+ (Reg >= ARM::D0 && Reg <= ARM::D15) ||
+ (Reg >= ARM::S0 && Reg <= ARM::S31))
+ return true;
+ }
+ return false;
+}
+
+void ARMExpandPseudo::CMSERestoreFPRegsV81(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs) {
+ if (!definesOrUsesFPReg(*MBBI)) {
+ // Load FP registers from stack.
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+
+ // Pop the stack space
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+ } else {
+ // Restore the floating point context.
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::VLDR_FPCXTS_post),
+ ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(8)
+ .add(predOps(ARMCC::AL));
+
+ // Pop all the callee-saved registers (s16-s31).
+ MachineInstrBuilder VPOP =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDMSIA_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg)
+ VPOP.addReg(Reg, RegState::Define);
+ }
+}
+
/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
/// possible. This only gets used at -O0 so we don't care about efficiency of
/// the generated code.
@@ -1149,6 +1763,93 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
return true;
}
+static void CMSEPushCalleeSaves(const TargetInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, int JumpReg,
+ const LivePhysRegs &LiveRegs, bool Thumb1Only) {
+ const DebugLoc &DL = MBBI->getDebugLoc();
+ if (Thumb1Only) { // push Lo and Hi regs separately
+ MachineInstrBuilder PushMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) {
+ PushMIB.addReg(
+ Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef);
+ }
+
+ // Thumb1 can only tPUSH low regs, so we copy the high regs to the low
+ // regs that we just saved and push the low regs again, taking care to
+ // not clobber JumpReg. If JumpReg is one of the low registers, push first
+ // the values of r9-r11, and then r8. That would leave them ordered in
+ // memory, and allow us to later pop them with a single instruction.
+ // FIXME: Could also use any of r0-r3 that are free (including in the
+ // first PUSH above).
+ for (int LoReg = ARM::R7, HiReg = ARM::R11; LoReg >= ARM::R4; --LoReg) {
+ if (JumpReg == LoReg)
+ continue;
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg)
+ .addReg(HiReg, LiveRegs.contains(HiReg) ? 0 : RegState::Undef)
+ .add(predOps(ARMCC::AL));
+ --HiReg;
+ }
+ MachineInstrBuilder PushMIB2 =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) {
+ if (Reg == JumpReg)
+ continue;
+ PushMIB2.addReg(Reg, RegState::Kill);
+ }
+
+ // If we couldn't use a low register for temporary storage (because it was
+ // the JumpReg), use r4 or r5, whichever is not JumpReg. It has already been
+ // saved.
+ if (JumpReg >= ARM::R4 && JumpReg <= ARM::R7) {
+ int LoReg = JumpReg == ARM::R4 ? ARM::R5 : ARM::R4;
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg)
+ .addReg(ARM::R8, LiveRegs.contains(ARM::R8) ? 0 : RegState::Undef)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(LoReg, RegState::Kill);
+ }
+ } else { // push Lo and Hi registers with a single instruction
+ MachineInstrBuilder PushMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STMDB_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) {
+ PushMIB.addReg(
+ Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef);
+ }
+ }
+}
+
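To make the ordering claim in the comment concrete (common case, JumpReg not in r4-r7): the first tPUSH stores r4-r7, the moves copy r8-r11 into r4-r7, and the second tPUSH stores those copies below them, so ascending memory holds the values of r8, r9, r10, r11, r4, r5, r6, r7, which is the same layout the single t2STMDB_UPD produces on the non-Thumb1 path and the one CMSEPopCalleeSaves below undoes.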
+static void CMSEPopCalleeSaves(const TargetInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, int JumpReg,
+ bool Thumb1Only) {
+ const DebugLoc &DL = MBBI->getDebugLoc();
+ if (Thumb1Only) {
+ MachineInstrBuilder PopMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
+ for (int R = 0; R < 4; ++R) {
+ PopMIB.addReg(ARM::R4 + R, RegState::Define);
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), ARM::R8 + R)
+ .addReg(ARM::R4 + R, RegState::Kill)
+ .add(predOps(ARMCC::AL));
+ }
+ MachineInstrBuilder PopMIB2 =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
+ for (int R = 0; R < 4; ++R)
+ PopMIB2.addReg(ARM::R4 + R, RegState::Define);
+ } else { // pop Lo and Hi registers with a single instruction
+ MachineInstrBuilder PopMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::t2LDMIA_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg)
+ PopMIB.addReg(Reg, RegState::Define);
+ }
+}
bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -1207,12 +1908,117 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Update call site info and delete the pseudo instruction TCRETURN.
- MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI);
+ if (MI.isCandidateForCallSiteEntry())
+ MI.getMF()->moveCallSiteInfo(&MI, &*NewMI);
MBB.erase(MBBI);
MBBI = NewMI;
return true;
}
+ case ARM::tBXNS_RET: {
+ MachineBasicBlock &AfterBB = CMSEClearFPRegs(MBB, MBBI);
+
+ if (STI->hasV8_1MMainlineOps()) {
+ // Restore the non-secure floating point context.
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(ARM::VLDR_FPCXTNS_post), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(4)
+ .add(predOps(ARMCC::AL));
+ }
+
+ // Clear all GPRs that are not used by the return instruction.
+ assert(llvm::all_of(MBBI->operands(), [](const MachineOperand &Op) {
+ return !Op.isReg() || Op.getReg() != ARM::R12;
+ }));
+ SmallVector<unsigned, 5> ClearRegs;
+ determineGPRegsToClear(
+ *MBBI, {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R12}, ClearRegs);
+ CMSEClearGPRegs(AfterBB, AfterBB.end(), MBBI->getDebugLoc(), ClearRegs,
+ ARM::LR);
+
+ MachineInstrBuilder NewMI =
+ BuildMI(AfterBB, AfterBB.end(), MBBI->getDebugLoc(),
+ TII->get(ARM::tBXNS))
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ for (const MachineOperand &Op : MI.operands())
+ NewMI->addOperand(Op);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::tBLXNS_CALL: {
+ DebugLoc DL = MBBI->getDebugLoc();
+ unsigned JumpReg = MBBI->getOperand(0).getReg();
+
+ // Figure out which registers are live at the point immediately before the
+ // call. When we indiscriminately push a set of registers, the live
+ // registers are added as ordinary use operands, whereas dead registers
+ // are "undef".
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(MBB);
+ for (const MachineInstr &MI : make_range(MBB.rbegin(), MBBI.getReverse()))
+ LiveRegs.stepBackward(MI);
+ LiveRegs.stepBackward(*MBBI);
+
+ CMSEPushCalleeSaves(*TII, MBB, MBBI, JumpReg, LiveRegs,
+ AFI->isThumb1OnlyFunction());
+
+ SmallVector<unsigned, 16> ClearRegs;
+ determineGPRegsToClear(*MBBI,
+ {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4,
+ ARM::R5, ARM::R6, ARM::R7, ARM::R8, ARM::R9,
+ ARM::R10, ARM::R11, ARM::R12},
+ ClearRegs);
+ auto OriginalClearRegs = ClearRegs;
+
+ // Get the first cleared register as a scratch (to use later with tBIC).
+ // We need to use the first so we can ensure it is a low register.
+ unsigned ScratchReg = ClearRegs.front();
+
+ // Clear LSB of JumpReg
+ if (AFI->isThumb2Function()) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), JumpReg)
+ .addReg(JumpReg)
+ .addImm(1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ } else {
+ // We need to use an extra register to cope with 8M Baseline;
+ // since we have saved all of the registers, we are ok to trash a
+ // non-argument register here.
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVi8), ScratchReg)
+ .add(condCodeOp())
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tBIC), JumpReg)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(JumpReg)
+ .addReg(ScratchReg)
+ .add(predOps(ARMCC::AL));
+ }
+
+ CMSESaveClearFPRegs(MBB, MBBI, DL, LiveRegs,
+ ClearRegs); // save+clear FP regs with ClearRegs
+ CMSEClearGPRegs(MBB, MBBI, DL, ClearRegs, JumpReg);
+
+ const MachineInstrBuilder NewCall =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tBLXNSr))
+ .add(predOps(ARMCC::AL))
+ .addReg(JumpReg, RegState::Kill);
+
+ for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
+ NewCall->addOperand(MI.getOperand(I));
+ if (MI.isCandidateForCallSiteEntry())
+ MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr());
+
+ CMSERestoreFPRegs(MBB, MBBI, DL, OriginalClearRegs); // restore FP registers
+
+ CMSEPopCalleeSaves(*TII, MBB, MBBI, JumpReg, AFI->isThumb1OnlyFunction());
+
+ MI.eraseFromParent();
+ return true;
+ }
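For context (not part of the patch): a tBLXNS_CALL pseudo typically originates from an indirect call through a function pointer marked for non-secure calls, roughly as in the hedged sketch below (assumes a v8-M target built with -mcmse; attribute spelling per the CMSE ACLE).

// Hypothetical caller: the attribute marks the call as crossing to non-secure
// state, which is what ends up expanded into the BLXNS sequence above.
void notify(void (*Callback)(int) __attribute__((cmse_nonsecure_call))) {
  Callback(0);
}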
case ARM::VMOVHcc:
case ARM::VMOVScc:
case ARM::VMOVDcc: {
@@ -1359,17 +2165,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// If there's dynamic realignment, adjust for it.
if (RI.needsStackRealignment(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
assert (!AFI->isThumb1OnlyFunction());
// Emit bic r6, r6, MaxAlign
- assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
- "immediates larger than 256 with all lower "
- "bits set.");
+ assert(MaxAlign <= Align(256) &&
+ "The BIC instruction cannot encode "
+ "immediates larger than 256 with all lower "
+ "bits set.");
unsigned bicOpc = AFI->isThumbFunction() ?
ARM::t2BICri : ARM::BICri;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6)
.addReg(ARM::R6, RegState::Kill)
- .addImm(MaxAlign - 1)
+ .addImm(MaxAlign.value() - 1)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
}
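A quick check (illustrative) of the realignment arithmetic above: BIC with MaxAlign - 1 clears the low bits and rounds the base pointer down to the realigned boundary.

// e.g. for MaxAlign = 16 the emitted instruction behaves like r6 &= ~15u.
static_assert((0x1007u & ~(16u - 1u)) == 0x1000u, "aligns down to 16");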
@@ -1410,17 +2217,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
const bool Thumb = Opcode == ARM::tTPsoft;
MachineInstrBuilder MIB;
+ MachineFunction *MF = MBB.getParent();
if (STI->genLongCalls()) {
- MachineFunction *MF = MBB.getParent();
MachineConstantPool *MCP = MF->getConstantPool();
unsigned PCLabelID = AFI->createPICLabelUId();
MachineConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(MF->getFunction().getContext(),
"__aeabi_read_tp", PCLabelID, 0);
Register Reg = MI.getOperand(0).getReg();
- MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
- .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+ MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
+ .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4)));
if (!Thumb)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
@@ -1440,7 +2248,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.cloneMemRefs(MI);
TransferImpOps(MI, MIB, MIB);
- MI.getMF()->moveCallSiteInfo(&MI, &*MIB);
+ // Update the call site info.
+ if (MI.isCandidateForCallSiteEntry())
+ MF->moveCallSiteInfo(&MI, &*MIB);
MI.eraseFromParent();
return true;
}
@@ -1504,7 +2314,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
- .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+ .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4)));
if (IsARM)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
@@ -1952,6 +2762,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case ARM::LOADDUAL:
+ case ARM::STOREDUAL: {
+ Register PairReg = MI.getOperand(0).getReg();
+
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == ARM::LOADDUAL ? ARM::LDRD : ARM::STRD))
+ .addReg(TRI->getSubReg(PairReg, ARM::gsub_0),
+ Opcode == ARM::LOADDUAL ? RegState::Define : 0)
+ .addReg(TRI->getSubReg(PairReg, ARM::gsub_1),
+ Opcode == ARM::LOADDUAL ? RegState::Define : 0);
+ for (unsigned i = 1; i < MI.getNumOperands(); i++)
+ MIB.add(MI.getOperand(i));
+ MIB.add(predOps(ARMCC::AL));
+ MIB.cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+ }
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
index 6e19db3c7e22..4bfca8a803ca 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -48,7 +48,6 @@
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -209,7 +208,7 @@ class ARMFastISel final : public FastISel {
unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg);
unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg);
unsigned ARMSelectCallOp(bool UseReg);
- unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
+ unsigned ARMLowerPICELF(const GlobalValue *GV, MVT VT);
const TargetLowering *getTargetLowering() { return &TLI; }
@@ -444,12 +443,8 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
if (!Subtarget->hasVFP2Base()) return false;
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0) {
- // TODO: Figure out if this is correct.
- Align = DL.getTypeAllocSize(CFP->getType());
- }
- unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
+ unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
@@ -508,12 +503,8 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
return 0;
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(C->getType());
- if (Align == 0) {
- // TODO: Figure out if this is correct.
- Align = DL.getTypeAllocSize(C->getType());
- }
- unsigned Idx = MCP.getConstantPoolIndex(C, Align);
+ Align Alignment = DL.getPrefTypeAlign(C->getType());
+ unsigned Idx = MCP.getConstantPoolIndex(C, Alignment);
ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -570,14 +561,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
} else {
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(GV->getType());
- if (Align == 0) {
- // TODO: Figure out if this is correct.
- Align = DL.getTypeAllocSize(GV->getType());
- }
+ Align Alignment = DL.getPrefTypeAlign(GV->getType());
if (Subtarget->isTargetELF() && IsPositionIndependent)
- return ARMLowerPICELF(GV, Align, VT);
+ return ARMLowerPICELF(GV, VT);
// Grab index.
unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
@@ -585,7 +572,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
ARMCP::CPValue,
PCAdj);
- unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+ unsigned Idx = MCP.getConstantPoolIndex(CPV, Alignment);
// Load value.
MachineInstrBuilder MIB;
@@ -882,7 +869,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
int Offset = Addr.Offset;
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI);
@@ -2090,6 +2077,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
bool ARMFastISel::SelectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
+ const bool IsCmseNSEntry = F.hasFnAttribute("cmse_nonsecure_entry");
if (!FuncInfo.CanLowerReturn)
return false;
@@ -2166,8 +2154,17 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
RetRegs.push_back(VA.getLocReg());
}
+ unsigned RetOpc;
+ if (IsCmseNSEntry)
+ if (isThumb2)
+ RetOpc = ARM::tBXNS_RET;
+ else
+ llvm_unreachable("CMSE not valid for non-Thumb targets");
+ else
+ RetOpc = Subtarget->getReturnOpcode();
+
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Subtarget->getReturnOpcode()));
+ TII.get(RetOpc));
AddOptionalDefs(MIB);
for (unsigned R : RetRegs)
MIB.addReg(R, RegState::Implicit);
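// Illustrative sketch, not from the patch: the return-opcode choice above,
// restated over plain enums. CMSE non-secure entry functions must return via
// BXNS, which only exists in Thumb state; everything else keeps the
// subtarget's usual return sequence. Names are hypothetical stand-ins.
#include <stdexcept>

enum class RetKindSketch { CmseBxnsReturn, NormalReturn };

RetKindSketch pickReturnKindSketch(bool IsCmseNSEntry, bool IsThumb2) {
  if (!IsCmseNSEntry)
    return RetKindSketch::NormalReturn;
  if (IsThumb2)
    return RetKindSketch::CmseBxnsReturn;
  throw std::logic_error("CMSE not valid for non-Thumb targets");
}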
@@ -2239,7 +2236,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
if (!isTypeLegal(ArgTy, ArgVT)) return false;
ISD::ArgFlagsTy Flags;
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
+ Flags.setOrigAlign(DL.getABITypeAlign(ArgTy));
Args.push_back(Op);
ArgRegs.push_back(Arg);
@@ -2293,7 +2290,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
bool ARMFastISel::SelectCall(const Instruction *I,
const char *IntrMemName = nullptr) {
const CallInst *CI = cast<CallInst>(I);
- const Value *Callee = CI->getCalledValue();
+ const Value *Callee = CI->getCalledOperand();
// Can't handle inline asm.
if (isa<InlineAsm>(Callee)) return false;
@@ -2302,12 +2299,11 @@ bool ARMFastISel::SelectCall(const Instruction *I,
if (CI->isTailCall()) return false;
// Check the calling convention.
- ImmutableCallSite CS(CI);
- CallingConv::ID CC = CS.getCallingConv();
+ CallingConv::ID CC = CI->getCallingConv();
// TODO: Avoid some calling conventions?
- FunctionType *FTy = CS.getFunctionType();
+ FunctionType *FTy = CI->getFunctionType();
bool isVarArg = FTy->isVarArg();
// Handle *simple* calls for now.
@@ -2334,47 +2330,46 @@ bool ARMFastISel::SelectCall(const Instruction *I,
SmallVector<Register, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
- unsigned arg_size = CS.arg_size();
+ unsigned arg_size = CI->arg_size();
Args.reserve(arg_size);
ArgRegs.reserve(arg_size);
ArgVTs.reserve(arg_size);
ArgFlags.reserve(arg_size);
- for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
- i != e; ++i) {
+ for (auto ArgI = CI->arg_begin(), ArgE = CI->arg_end(); ArgI != ArgE; ++ArgI) {
// If we're lowering a memory intrinsic instead of a regular call, skip the
// last argument, which shouldn't be passed to the underlying function.
- if (IntrMemName && e - i <= 1)
+ if (IntrMemName && ArgE - ArgI <= 1)
break;
ISD::ArgFlagsTy Flags;
- unsigned ArgIdx = i - CS.arg_begin();
- if (CS.paramHasAttr(ArgIdx, Attribute::SExt))
+ unsigned ArgIdx = ArgI - CI->arg_begin();
+ if (CI->paramHasAttr(ArgIdx, Attribute::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(ArgIdx, Attribute::ZExt))
+ if (CI->paramHasAttr(ArgIdx, Attribute::ZExt))
Flags.setZExt();
// FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(ArgIdx, Attribute::InReg) ||
- CS.paramHasAttr(ArgIdx, Attribute::StructRet) ||
- CS.paramHasAttr(ArgIdx, Attribute::SwiftSelf) ||
- CS.paramHasAttr(ArgIdx, Attribute::SwiftError) ||
- CS.paramHasAttr(ArgIdx, Attribute::Nest) ||
- CS.paramHasAttr(ArgIdx, Attribute::ByVal))
+ if (CI->paramHasAttr(ArgIdx, Attribute::InReg) ||
+ CI->paramHasAttr(ArgIdx, Attribute::StructRet) ||
+ CI->paramHasAttr(ArgIdx, Attribute::SwiftSelf) ||
+ CI->paramHasAttr(ArgIdx, Attribute::SwiftError) ||
+ CI->paramHasAttr(ArgIdx, Attribute::Nest) ||
+ CI->paramHasAttr(ArgIdx, Attribute::ByVal))
return false;
- Type *ArgTy = (*i)->getType();
+ Type *ArgTy = (*ArgI)->getType();
MVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 &&
ArgVT != MVT::i1)
return false;
- Register Arg = getRegForValue(*i);
+ Register Arg = getRegForValue(*ArgI);
if (!Arg.isValid())
return false;
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
+ Flags.setOrigAlign(DL.getABITypeAlign(ArgTy));
- Args.push_back(*i);
+ Args.push_back(*ArgI);
ArgRegs.push_back(Arg);
ArgVTs.push_back(ArgVT);
ArgFlags.push_back(Flags);
@@ -2949,8 +2944,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
return true;
}
-unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
- unsigned Align, MVT VT) {
+unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) {
bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
LLVMContext *Context = &MF->getFunction().getContext();
@@ -2961,12 +2955,12 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
/*AddCurrentAddress=*/UseGOT_PREL);
- unsigned ConstAlign =
- MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+ Align ConstAlign =
+ MF->getDataLayout().getPrefTypeAlign(Type::getInt32PtrTy(*Context));
unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign);
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index cb98b2b34efd..8a8f3237bb6f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -142,27 +142,6 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
}
-static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII,
- const MCPhysReg *CSRegs) {
- // Integer spill area is handled with "pop".
- if (isPopOpcode(MI.getOpcode())) {
- // The first two operands are predicates. The last two are
- // imp-def and imp-use of SP. Check everything in between.
- for (int i = 5, e = MI.getNumOperands(); i != e; ++i)
- if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
- return false;
- return true;
- }
- if ((MI.getOpcode() == ARM::LDR_POST_IMM ||
- MI.getOpcode() == ARM::LDR_POST_REG ||
- MI.getOpcode() == ARM::t2LDR_POST) &&
- isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) &&
- MI.getOperand(1).getReg() == ARM::SP)
- return true;
-
- return false;
-}
-
static void emitRegPlusImmediate(
bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
@@ -256,9 +235,9 @@ struct StackAdjustingInsts {
if (HasFP && !Info.BeforeFPSet)
return;
- CFAOffset -= Info.SPAdjust;
+ CFAOffset += Info.SPAdjust;
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, std::next(Info.I), dl,
TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
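// Illustrative sketch, not from the patch: the sign flip above (accumulating
// CFAOffset with += and calling cfiDefCfaOffset with a positive value)
// matches the DWARF reading of .cfi_def_cfa_offset as the positive distance
// from the current SP up to the CFA. Plain integers, hypothetical names.
struct CfaTrackerSketch {
  int CFAOffset = 0; // bytes from SP up to the CFA; grows as SP moves down
  int noteSPAdjust(int SPAdjustBytes) {
    CFAOffset += SPAdjustBytes;
    return CFAOffset; // value to emit as .cfi_def_cfa_offset
  }
};
// e.g. "push {r7, lr}" then "sub sp, sp, #8":
//   noteSPAdjust(8)  -> .cfi_def_cfa_offset 8
//   noteSPAdjust(8)  -> .cfi_def_cfa_offset 16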
@@ -281,13 +260,13 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const unsigned Reg,
- const unsigned Alignment,
+ const Align Alignment,
const bool MustBeSingleInstruction) {
const ARMSubtarget &AST =
static_cast<const ARMSubtarget &>(MF.getSubtarget());
const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
- const unsigned AlignMask = Alignment - 1;
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ const unsigned AlignMask = Alignment.value() - 1U;
+ const unsigned NrBitsToZero = Log2(Alignment);
assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
if (!AFI->isThumbFunction()) {
// if the BFC instruction is available, use that to zero the lower
@@ -343,14 +322,15 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
/// Unfortunately we cannot determine this value in determineCalleeSaves() yet
/// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
/// this to produce a conservative estimate that we check in an assert() later.
-static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI) {
// For Thumb1, push.w isn't available, so the first push will always push
// r7 and lr onto the stack first.
if (AFI.isThumb1OnlyFunction())
return -AFI.getArgRegsSaveSize() - (2 * 4);
// This is a conservative estimation: Assume the frame pointer being r7 and
// pc("r15") up to r8 getting spilled before (= 8 registers).
- return -AFI.getArgRegsSaveSize() - (8 * 4);
+ int FPCXTSaveSize = (STI.hasV8_1MMainlineOps() && AFI.isCmseNSEntryFunction()) ? 4 : 0;
+ return - FPCXTSaveSize - AFI.getArgRegsSaveSize() - (8 * 4);
}
void ARMFrameLowering::emitPrologue(MachineFunction &MF,
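// Illustrative sketch, not from the patch: the conservative bound computed by
// getMaxFPOffset above, as plain arithmetic. Thumb1 assumes only r7/lr are
// pushed ahead of the frame pointer; otherwise up to eight GPRs plus, for a
// v8.1-M Mainline CMSE entry function, a 4-byte FPCXT save slot. Hypothetical
// names.
int maxFPOffsetSketch(bool IsThumb1Only, bool IsV81MMainCmseEntry,
                      int ArgRegsSaveSize) {
  if (IsThumb1Only)
    return -ArgRegsSaveSize - 2 * 4;
  int FPCXTSaveSize = IsV81MMainCmseEntry ? 4 : 0;
  return -FPCXTSaveSize - ArgRegsSaveSize - 8 * 4;
}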
@@ -367,10 +347,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
assert(!AFI->isThumb1OnlyFunction() &&
"This emitPrologue does not support Thumb1!");
bool isARM = !AFI->isThumbFunction();
- unsigned Align = STI.getFrameLowering()->getStackAlignment();
+ Align Alignment = STI.getFrameLowering()->getStackAlign();
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
unsigned NumBytes = MFI.getStackSize();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ int FPCXTSaveSize = 0;
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
@@ -439,6 +420,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
FramePtrSpillFI = FI;
GPRCS1Size += 4;
break;
+ case ARM::FPCXTNS:
+ FPCXTSaveSize = 4;
+ break;
default:
// This is a DPR. Exclude the aligned DPRCS2 spills.
if (Reg == ARM::D8)
@@ -448,25 +432,35 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
}
}
- // Move past area 1.
+ // Move past FPCXT area.
MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
+ if (FPCXTSaveSize > 0) {
+ LastPush = MBBI++;
+ DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true);
+ }
+
+ // Move past area 1.
if (GPRCS1Size > 0) {
GPRCS1Push = LastPush = MBBI++;
DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
}
// Determine starting offsets of spill areas.
- unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size;
+ unsigned FPCXTOffset = NumBytes - ArgRegsSaveSize - FPCXTSaveSize;
+ unsigned GPRCS1Offset = FPCXTOffset - GPRCS1Size;
unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
- unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U;
- unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign;
+ Align DPRAlign = DPRCSSize ? std::min(Align(8), Alignment) : Align(4);
+ unsigned DPRGapSize =
+ (GPRCS1Size + GPRCS2Size + FPCXTSaveSize + ArgRegsSaveSize) %
+ DPRAlign.value();
+
unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
int FramePtrOffsetInPush = 0;
if (HasFP) {
int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
- assert(getMaxFPOffset(MF.getFunction(), *AFI) <= FPOffset &&
+ assert(getMaxFPOffset(STI, *AFI) <= FPOffset &&
"Max FP estimation is wrong");
- FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize;
+ FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize;
AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
NumBytes);
}
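// Illustrative sketch, not from the patch: how the spill-area offsets above
// stack once the 4-byte FPCXT save area sits between the argument-register
// save area and GPR area 1, with the gap keeping the DPR area aligned. Plain
// arithmetic, hypothetical names.
struct SpillLayoutSketch {
  unsigned FPCXTOffset, GPRCS1Offset, GPRCS2Offset, DPRGapSize, DPRCSOffset;
};

SpillLayoutSketch layoutSpillAreasSketch(unsigned NumBytes,
                                         unsigned ArgRegsSaveSize,
                                         unsigned FPCXTSaveSize,
                                         unsigned GPRCS1Size,
                                         unsigned GPRCS2Size,
                                         unsigned DPRCSSize,
                                         unsigned DPRAlign /* 4 or 8 */) {
  SpillLayoutSketch L;
  L.FPCXTOffset = NumBytes - ArgRegsSaveSize - FPCXTSaveSize;
  L.GPRCS1Offset = L.FPCXTOffset - GPRCS1Size;
  L.GPRCS2Offset = L.GPRCS1Offset - GPRCS2Size;
  L.DPRGapSize =
      (GPRCS1Size + GPRCS2Size + FPCXTSaveSize + ArgRegsSaveSize) % DPRAlign;
  L.DPRCSOffset = L.GPRCS2Offset - L.DPRGapSize - DPRCSSize;
  return L;
}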
@@ -599,9 +593,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
PushSize + FramePtrOffsetInPush,
MachineInstr::FrameSetup);
if (FramePtrOffsetInPush + PushSize != 0) {
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, MRI->getDwarfRegNum(FramePtr, true),
- -(ArgRegsSaveSize - FramePtrOffsetInPush)));
+ FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush));
BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -707,6 +701,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
AFI->getFramePtrSpillOffset());
+ AFI->setFPCXTSaveAreaSize(FPCXTSaveSize);
AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
AFI->setDPRCalleeSavedGapSize(DPRGapSize);
@@ -717,7 +712,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// If aligned NEON registers were spilled, the stack has already been
// realigned.
if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
assert(!AFI->isThumb1OnlyFunction());
if (!AFI->isThumbFunction()) {
emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
@@ -793,20 +788,22 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize,
+ MachineInstr::FrameDestroy);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
if (MBBI != MBB.begin()) {
do {
--MBBI;
- } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
- if (!isCSRestore(*MBBI, TII, CSRegs))
+ } while (MBBI != MBB.begin() &&
+ MBBI->getFlag(MachineInstr::FrameDestroy));
+ if (!MBBI->getFlag(MachineInstr::FrameDestroy))
++MBBI;
}
// Move SP to start of FP callee save spill area.
NumBytes -= (ArgRegsSaveSize +
+ AFI->getFPCXTSaveAreaSize() +
AFI->getGPRCalleeSavedArea1Size() +
AFI->getGPRCalleeSavedArea2Size() +
AFI->getDPRCalleeSavedGapSize() +
@@ -819,7 +816,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (NumBytes) {
if (isARM)
emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
- ARMCC::AL, 0, TII);
+ ARMCC::AL, 0, TII,
+ MachineInstr::FrameDestroy);
else {
// It's not possible to restore SP from FP in a single instruction.
// For iOS, this looks like:
@@ -831,10 +829,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
- ARMCC::AL, 0, TII);
+ ARMCC::AL, 0, TII, MachineInstr::FrameDestroy);
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
.addReg(ARM::R4)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlag(MachineInstr::FrameDestroy);
}
} else {
// Thumb2 or ARM.
@@ -842,15 +841,18 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
.addReg(FramePtr)
.add(predOps(ARMCC::AL))
- .add(condCodeOp());
+ .add(condCodeOp())
+ .setMIFlag(MachineInstr::FrameDestroy);
else
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
.addReg(FramePtr)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlag(MachineInstr::FrameDestroy);
}
} else if (NumBytes &&
!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes,
+ MachineInstr::FrameDestroy);
// Increment past our save areas.
if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) {
@@ -863,31 +865,32 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (AFI->getDPRCalleeSavedGapSize()) {
assert(AFI->getDPRCalleeSavedGapSize() == 4 &&
"unexpected DPR alignment gap");
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize());
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize(),
+ MachineInstr::FrameDestroy);
}
if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
+ if (AFI->getFPCXTSaveAreaSize()) MBBI++;
}
if (ArgRegsSaveSize)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize,
+ MachineInstr::FrameDestroy);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
-int
-ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+int ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
}
-int
-ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- int SPAdj) const {
+int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+ int FI, Register &FrameReg,
+ int SPAdj) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
@@ -969,10 +972,9 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
unsigned StmOpc, unsigned StrOpc,
- bool NoGap,
- bool(*Func)(unsigned, bool),
+ bool NoGap, bool (*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs,
unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
@@ -1047,10 +1049,10 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
unsigned LdmOpc, unsigned LdrOpc,
bool isVarArg, bool NoGap,
- bool(*Func)(unsigned, bool),
+ bool (*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -1060,6 +1062,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool isTailCall = false;
bool isInterrupt = false;
bool isTrap = false;
+ bool isCmseEntry = false;
if (MBB.end() != MI) {
DL = MI->getDebugLoc();
unsigned RetOpcode = MI->getOpcode();
@@ -1069,6 +1072,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
isTrap =
RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
RetOpcode == ARM::tTRAP;
+ isCmseEntry = (RetOpcode == ARM::tBXNS || RetOpcode == ARM::tBXNS_RET);
}
SmallVector<unsigned, 4> Regs;
@@ -1086,7 +1090,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
continue;
if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
- !isTrap && STI.hasV5TOps()) {
+ !isCmseEntry && !isTrap && STI.hasV5TOps()) {
if (MBB.succ_empty()) {
Reg = ARM::PC;
// Fold the return instruction into the LDM.
@@ -1119,7 +1123,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Regs.size() > 1 || LdrOpc == 0) {
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
.addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MachineInstr::FrameDestroy);
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
if (DeleteRet) {
@@ -1137,7 +1142,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
.addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP);
+ .addReg(ARM::SP)
+ .setMIFlags(MachineInstr::FrameDestroy);
// ARM mode needs an extra reg0 here due to addrmode2. Will go away once
// that refactoring is complete (eventually).
if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
@@ -1162,7 +1168,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -1180,7 +1186,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
int FI = CSI[i].getFrameIdx();
// The even-numbered registers will be 16-byte aligned, the odd-numbered
// registers will be 8-byte aligned.
- MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16);
+ MFI.setObjectAlignment(FI, DNum % 2 ? Align(8) : Align(16));
// The stack slot for D8 needs to be maximally aligned because this is
// actually the point where we align the stack pointer. MachineFrameInfo
@@ -1189,7 +1195,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// over-alignment is not realized because the code inserted below adjusts
// the stack pointer by numregs * 8 before aligning the stack pointer.
if (DNum == 0)
- MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
+ MFI.setObjectAlignment(FI, MFI.getMaxAlign());
}
// Move the stack pointer to the d8 spill slot, and align it at the same
@@ -1212,7 +1218,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
.add(predOps(ARMCC::AL))
.add(condCodeOp());
- unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment();
+ Align MaxAlign = MF.getFrameInfo().getMaxAlign();
// We must set parameter MustBeSingleInstruction to true, since
// skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
// stack alignment. Luckily, this can always be done since all ARM
@@ -1335,7 +1341,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -1422,10 +1428,9 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
}
-bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool ARMFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -1437,6 +1442,16 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
ARM::t2STR_PRE : ARM::STR_PRE_IMM;
unsigned FltOpc = ARM::VSTMDDB_UPD;
unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+ // Save the non-secure floating point context.
+ if (llvm::any_of(CSI, [](const CalleeSavedInfo &C) {
+ return C.getReg() == ARM::FPCXTNS;
+ })) {
+ BuildMI(MBB, MI, DebugLoc(), STI.getInstrInfo()->get(ARM::VSTR_FPCXTNS_pre),
+ ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(-4)
+ .add(predOps(ARMCC::AL));
+ }
emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
MachineInstr::FrameSetup);
emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
@@ -1453,10 +1468,9 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool ARMFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -1601,7 +1615,7 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
return;
// Don't bother if the default stack alignment is sufficiently high.
- if (MF.getSubtarget().getFrameLowering()->getStackAlignment() >= 8)
+ if (MF.getSubtarget().getFrameLowering()->getStackAlign() >= Align(8))
return;
// Aligned spills require stack realignment.
@@ -1630,6 +1644,16 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
SavedRegs.set(ARM::R4);
}
+bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // For CMSE entry functions, we want to save the FPCXT_NS immediately
+ // upon function entry (resp. restore it immediately before return)
+ if (STI.hasV8_1MMainlineOps() &&
+ MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction())
+ return false;
+
+ return true;
+}
+
void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
@@ -1699,6 +1723,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (RegInfo->hasBasePointer(MF))
SavedRegs.set(RegInfo->getBaseRegister());
+ // On v8.1-M.Main CMSE entry functions save/restore FPCXT.
+ if (STI.hasV8_1MMainlineOps() && AFI->isCmseNSEntryFunction())
+ CanEliminateFrame = false;
+
// Don't spill FP if the frame can be eliminated. This is determined
// by scanning the callee-save registers to see if any is modified.
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
@@ -1771,8 +1799,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII);
// Force LR to be spilled if the Thumb function size is > 2048. This enables
- // use of BL to implement far jump. If it turns out that it's not needed
- // then the branch fix up path will undo it.
+ // use of BL to implement far jump.
if (FnSize >= (1 << 11)) {
CanEliminateFrame = false;
ForceLRSpill = true;
@@ -1858,7 +1885,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
//
// We could do slightly better on Thumb1; in some cases, an sp-relative
// offset would be legal even though an fp-relative offset is not.
- int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
+ int MaxFPOffset = getMaxFPOffset(STI, *AFI);
bool HasLargeArgumentList =
HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
@@ -2045,8 +2072,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// of GPRs, spill one extra callee save GPR so we won't have to pad between
// the integer and double callee save areas.
LLVM_DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
- unsigned TargetAlign = getStackAlignment();
- if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
+ const Align TargetAlign = getStackAlign();
+ if (TargetAlign >= Align(8) && (NumGPRSpills & 1)) {
if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
unsigned Reg = UnspilledCS1GPRs[i];
@@ -2083,7 +2110,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (BigFrameOffsets && !ExtraCSSpill) {
// If any non-reserved CS register isn't spilled, just spill one or two
// extra. That should take care of it!
- unsigned NumExtras = TargetAlign / 4;
+ unsigned NumExtras = TargetAlign.value() / 4;
SmallVector<unsigned, 2> Extras;
while (NumExtras && !UnspilledCS1GPRs.empty()) {
unsigned Reg = UnspilledCS1GPRs.back();
@@ -2117,16 +2144,15 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
const TargetRegisterClass &RC = ARM::GPRRegClass;
unsigned Size = TRI->getSpillSize(RC);
- unsigned Align = TRI->getSpillAlignment(RC);
- RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
+ Align Alignment = TRI->getSpillAlign(RC);
+ RS->addScavengingFrameIndex(
+ MFI.CreateStackObject(Size, Alignment, false));
}
}
}
- if (ForceLRSpill) {
+ if (ForceLRSpill)
SavedRegs.set(ARM::LR);
- AFI->setLRIsSpilledForFarJump(true);
- }
AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
}
@@ -2142,6 +2168,27 @@ void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF,
SavedRegs.set(ARM::R0);
}
+bool ARMFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ // For CMSE entry functions, handle floating-point context as if it was a
+ // callee-saved register.
+ if (STI.hasV8_1MMainlineOps() &&
+ MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction()) {
+ CSI.emplace_back(ARM::FPCXTNS);
+ CSI.back().setRestored(false);
+ }
+
+ return false;
+}
+
+const TargetFrameLowering::SpillSlot *
+ARMFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+ static const SpillSlot FixedSpillOffsets[] = {{ARM::FPCXTNS, -4}};
+ NumEntries = array_lengthof(FixedSpillOffsets);
+ return FixedSpillOffsets;
+}
+
MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
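// Illustrative sketch, not from the patch: the FPCXT handling above treats
// the non-secure floating-point context as one more callee-saved entry -- a
// pseudo register appended to the CSI list, marked so the generic restore
// path skips it, with a fixed slot 4 bytes below the incoming SP. Mock types,
// hypothetical names.
#include <vector>

struct CalleeSavedInfoSketch {
  int PseudoReg;
  bool Restored = true;
};

void addFpcxtEntrySketch(std::vector<CalleeSavedInfoSketch> &CSI,
                         bool IsV81MMainCmseEntry, int FpcxtPseudoReg) {
  if (!IsV81MMainCmseEntry)
    return;
  CSI.push_back({FpcxtPseudoReg});
  CSI.back().Restored = false; // skipped by the generic restore path
}

constexpr int FpcxtFixedSpillOffsetSketch = -4; // analogue of the {FPCXTNS, -4} slot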
@@ -2364,8 +2411,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Emit the relevant DWARF information about the change in stack pointer as
// well as where to find both r4 and r5 (the callee-save registers)
- CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 8));
BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
@@ -2409,7 +2455,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
MF.getFunction().getContext(), "__STACK_LIMIT", PCLabelId, 0);
MachineConstantPool *MCP = MF.getConstantPool();
- unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
+ unsigned CPI = MCP->getConstantPoolIndex(NewCPV, Align(4));
// ldr SR0, [pc, offset(STACK_LIMIT)]
BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
@@ -2507,8 +2553,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Emit the DWARF info about the change in stack as well as where to find the
// previous link register
- CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 12));
BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
@@ -2570,7 +2615,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
}
// Update the CFA offset now that we've popped
- CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0));
BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
@@ -2594,7 +2639,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
}
// Update the CFA offset now that we've popped
- CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0));
BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
index 0462b01af707..4c2c07d64f57 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -9,9 +9,7 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
-#include <vector>
namespace llvm {
@@ -33,13 +31,14 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool keepFramePointer(const MachineFunction &MF) const override;
@@ -49,9 +48,9 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg, int SPAdj) const;
+ Register &FrameReg, int SPAdj) const;
void getCalleeSaves(const MachineFunction &MF,
BitVector &SavedRegs) const override;
@@ -62,25 +61,31 @@ public:
MachineBasicBlock &MBB) const override;
/// Returns true if the target will correctly handle shrink wrapping.
- bool enableShrinkWrapping(const MachineFunction &MF) const override {
- return true;
- }
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
bool isProfitableForNoCSROpt(const Function &F) const override {
// The no-CSR optimisation is bad for code size on ARM, because we can save
// many registers with a single PUSH/POP pair.
return false;
}
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ const SpillSlot *
+ getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+
private:
void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
- unsigned StrOpc, bool NoGap,
- bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs,
- unsigned MIFlags = 0) const;
+ ArrayRef<CalleeSavedInfo> CSI, unsigned StmOpc,
+ unsigned StrOpc, bool NoGap, bool (*Func)(unsigned, bool),
+ unsigned NumAlignedDPRCS2Regs, unsigned MIFlags = 0) const;
void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+ MutableArrayRef<CalleeSavedInfo> CSI, unsigned LdmOpc,
unsigned LdrOpc, bool isVarArg, bool NoGap,
- bool(*Func)(unsigned, bool),
+ bool (*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const;
MachineBasicBlock::iterator
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9b06987178d8..2a9a31dab74f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -145,6 +145,8 @@ public:
// Thumb 2 Addressing Modes:
bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
+ template <unsigned Shift>
+ bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm);
bool SelectT2AddrModeImm8(SDValue N, SDValue &Base,
SDValue &OffImm);
bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
@@ -237,6 +239,10 @@ private:
void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
+ /// SelectMVE_VSHLC - Select MVE intrinsics for a shift that carries between
+ /// vector lanes.
+ void SelectMVE_VSHLC(SDNode *N, bool Predicated);
+
/// Select long MVE vector reductions with two vector operands
/// Stride is the number of vector element widths the instruction can operate
/// on:
@@ -264,7 +270,21 @@ private:
/// pointer points to a set of NumVecs sub-opcodes used for the
/// different stages (e.g. VLD20 versus VLD21) of each load family.
void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
- const uint16_t *const *Opcodes);
+ const uint16_t *const *Opcodes, bool HasWriteback);
+
+ /// SelectMVE_VxDUP - Select MVE incrementing-dup instructions. Opcodes is an
+ /// array of 3 elements for the 8, 16 and 32-bit lane sizes.
+ void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
+ bool Wrapping, bool Predicated);
+
+ /// Select SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D,
+ /// CX1DA, CX2D, CX2DA, CX3, CX3DA).
+ /// \arg \c NumExtraOps number of extra operands besides the coprocessor,
+ /// the accumulator and the immediate operand, i.e. 0
+ /// for CX1*, 1 for CX2*, 2 for CX3*
+ /// \arg \c HasAccum whether the instruction has an accumulator operand
+ void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps,
+ bool HasAccum);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
@@ -1171,8 +1191,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
// Only multiples of 4 are allowed for the offset, so the frame object
// alignment must be at least 4.
MachineFrameInfo &MFI = MF->getFrameInfo();
- if (MFI.getObjectAlignment(FI) < 4)
- MFI.setObjectAlignment(FI, 4);
+ if (MFI.getObjectAlign(FI) < Align(4))
+ MFI.setObjectAlignment(FI, Align(4));
Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
@@ -1195,9 +1215,9 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
if (RHSC * 4 < MFI.getObjectSize(FI)) {
// For LHS+RHS to result in an offset that's a multiple of 4 the object
// indexed by the LHS must be 4-byte aligned.
- if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4)
- MFI.setObjectAlignment(FI, 4);
- if (MFI.getObjectAlignment(FI) >= 4) {
+ if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlign(FI) < Align(4))
+ MFI.setObjectAlignment(FI, Align(4));
+ if (MFI.getObjectAlign(FI) >= Align(4)) {
Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
@@ -1294,6 +1314,33 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
return true;
}
+template <unsigned Shift>
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) {
+ int RHSC;
+ if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -255, 256, RHSC)) {
+ Base = N.getOperand(0);
+ if (Base.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+ Base = CurDAG->getTargetFrameIndex(
+ FI, TLI->getPointerTy(CurDAG->getDataLayout()));
+ }
+
+ if (N.getOpcode() == ISD::SUB)
+ RHSC = -RHSC;
+ OffImm =
+ CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ // Base only.
+ Base = N;
+ OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
+ return true;
+}
+
bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
SDValue &Base, SDValue &OffImm) {
// Match simple R - imm8 operands.
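// Illustrative sketch, not from the patch: what the templated
// SelectT2AddrModeImm8<Shift> above accepts -- a constant offset that is a
// multiple of 1<<Shift whose scaled value lies in -255..255; anything else
// falls back to "base only, offset 0". Plain integers, hypothetical names.
#include <optional>

std::optional<int> matchScaledImm8Sketch(int OffsetBytes, unsigned Shift) {
  const int Scale = 1 << Shift;
  if (OffsetBytes % Scale != 0)
    return std::nullopt;
  const int Scaled = OffsetBytes / Scale;
  if (Scaled < -255 || Scaled > 255)
    return std::nullopt;
  return Scaled * Scale; // the immediate actually encoded on the instruction
}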
@@ -1679,7 +1726,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
EVT LoadedVT;
unsigned Opcode = 0;
bool isSExtLd, isPre;
- unsigned Align;
+ Align Alignment;
ARMVCC::VPTCodes Pred;
SDValue PredReg;
SDValue Chain, Base, Offset;
@@ -1695,7 +1742,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
Chain = LD->getChain();
Base = LD->getBasePtr();
Offset = LD->getOffset();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
Pred = ARMVCC::None;
@@ -1711,7 +1758,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
Chain = LD->getChain();
Base = LD->getBasePtr();
Offset = LD->getOffset();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
Pred = ARMVCC::Then;
@@ -1725,7 +1772,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N);
SDValue NewOffset;
- if (Align >= 2 && LoadedVT == MVT::v4i16 &&
+ if (Alignment >= Align(2) && LoadedVT == MVT::v4i16 &&
SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post;
@@ -1743,12 +1790,12 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post;
else
Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post;
- } else if (Align >= 4 &&
+ } else if (Alignment >= Align(4) &&
(CanChangeType || LoadedVT == MVT::v4i32 ||
LoadedVT == MVT::v4f32) &&
SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2))
Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post;
- else if (Align >= 2 &&
+ else if (Alignment >= Align(2) &&
(CanChangeType || LoadedVT == MVT::v8i16 ||
LoadedVT == MVT::v8f16) &&
SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1))
@@ -1762,8 +1809,8 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
SDValue Ops[] = {Base, NewOffset,
CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg,
Chain};
- SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0),
- MVT::i32, MVT::Other, Ops);
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ N->getValueType(0), MVT::Other, Ops);
transferMemOperands(N, New);
ReplaceUses(SDValue(N, 0), SDValue(New, 1));
ReplaceUses(SDValue(N, 1), SDValue(New, 0));
@@ -2009,6 +2056,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
SDLoc dl(N);
@@ -2030,6 +2078,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
@@ -2037,6 +2086,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
@@ -2148,6 +2198,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
SDLoc dl(N);
@@ -2172,6 +2223,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
@@ -2179,6 +2231,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
@@ -2299,6 +2352,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
SDLoc dl(N);
@@ -2339,11 +2393,13 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
// Quad-register operations:
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v8i16: OpcodeIndex = 0; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 1; break;
@@ -2482,7 +2538,16 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes,
Ops.push_back(N->getOperand(0)); // chain
- CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+ SmallVector<EVT, 8> VTs;
+ VTs.push_back(N->getValueType(1));
+ VTs.push_back(N->getValueType(0));
+ VTs.push_back(N->getValueType(2));
+
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), VTs, Ops);
+ ReplaceUses(SDValue(N, 0), SDValue(New, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(New, 0));
+ ReplaceUses(SDValue(N, 2), SDValue(New, 2));
+ CurDAG->RemoveDeadNode(N);
}
void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
@@ -2552,6 +2617,25 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
}
+void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) {
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ // One vector input, followed by a 32-bit word of bits to shift in
+ // and then an immediate shift count
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(2));
+ int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(4));
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc);
+
+ CurDAG->SelectNodeTo(N, ARM::MVE_VSHLC, N->getVTList(), makeArrayRef(Ops));
+}
+
static bool SDValueToConstBool(SDValue SDVal) {
assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
@@ -2644,7 +2728,8 @@ void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated,
}
void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
- const uint16_t *const *Opcodes) {
+ const uint16_t *const *Opcodes,
+ bool HasWriteback) {
EVT VT = N->getValueType(0);
SDLoc Loc(N);
@@ -2664,23 +2749,141 @@ void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
}
EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2);
- EVT ResultTys[] = {DataTy, MVT::Other};
+ SmallVector<EVT, 4> ResultTys = {DataTy, MVT::Other};
+ unsigned PtrOperand = HasWriteback ? 1 : 2;
auto Data = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0);
SDValue Chain = N->getOperand(0);
- for (unsigned Stage = 0; Stage < NumVecs; ++Stage) {
- SDValue Ops[] = {Data, N->getOperand(2), Chain};
+ // Add a MVE_VLDn instruction for each Vec, except the last
+ for (unsigned Stage = 0; Stage < NumVecs - 1; ++Stage) {
+ SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
auto LoadInst =
CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops);
Data = SDValue(LoadInst, 0);
Chain = SDValue(LoadInst, 1);
}
+ // The last may need a writeback on it
+ if (HasWriteback)
+ ResultTys = {DataTy, MVT::i32, MVT::Other};
+ SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
+ auto LoadInst =
+ CurDAG->getMachineNode(OurOpcodes[NumVecs - 1], Loc, ResultTys, Ops);
- for (unsigned i = 0; i < NumVecs; i++)
+ unsigned i;
+ for (i = 0; i < NumVecs; i++)
ReplaceUses(SDValue(N, i),
- CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data));
- ReplaceUses(SDValue(N, NumVecs), Chain);
+ CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT,
+ SDValue(LoadInst, 0)));
+ if (HasWriteback)
+ ReplaceUses(SDValue(N, i++), SDValue(LoadInst, 1));
+ ReplaceUses(SDValue(N, i), SDValue(LoadInst, HasWriteback ? 2 : 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
+ bool Wrapping, bool Predicated) {
+ EVT VT = N->getValueType(0);
+ SDLoc Loc(N);
+
+ uint16_t Opcode;
+ switch (VT.getScalarSizeInBits()) {
+ case 8:
+ Opcode = Opcodes[0];
+ break;
+ case 16:
+ Opcode = Opcodes[1];
+ break;
+ case 32:
+ Opcode = Opcodes[2];
+ break;
+ default:
+ llvm_unreachable("bad vector element size in SelectMVE_VxDUP");
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ unsigned OpIdx = 1;
+
+ SDValue Inactive;
+ if (Predicated)
+ Inactive = N->getOperand(OpIdx++);
+
+ Ops.push_back(N->getOperand(OpIdx++)); // base
+ if (Wrapping)
+ Ops.push_back(N->getOperand(OpIdx++)); // limit
+
+ SDValue ImmOp = N->getOperand(OpIdx++); // step
+ int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmValue, Loc));
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(OpIdx), Inactive);
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0));
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,
+ size_t NumExtraOps, bool HasAccum) {
+ bool IsBigEndian = CurDAG->getDataLayout().isBigEndian();
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ unsigned OpIdx = 1;
+
+ // Convert and append the immediate operand designating the coprocessor.
+ SDValue ImmCorpoc = N->getOperand(OpIdx++);
+ uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmCoprocVal, Loc));
+
+ // For accumulating variants copy the low and high order parts of the
+ // accumulator into a register pair and add it to the operand vector.
+ if (HasAccum) {
+ SDValue AccLo = N->getOperand(OpIdx++);
+ SDValue AccHi = N->getOperand(OpIdx++);
+ if (IsBigEndian)
+ std::swap(AccLo, AccHi);
+ Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0));
+ }
+
+ // Copy extra operands as-is.
+ for (size_t I = 0; I < NumExtraOps; I++)
+ Ops.push_back(N->getOperand(OpIdx++));
+
+ // Convert and append the immediate operand
+ SDValue Imm = N->getOperand(OpIdx);
+ uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmVal, Loc));
+
+ // Accumulating variants are IT-predicable, add predicate operands.
+ if (HasAccum) {
+ SDValue Pred = getAL(CurDAG, Loc);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ Ops.push_back(Pred);
+ Ops.push_back(PredReg);
+ }
+
+ // Create the CDE instruction
+ SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops);
+ SDValue ResultPair = SDValue(InstrNode, 0);
+
+ // The original intrinsic had two outputs, and the output of the dual-register
+ // CDE instruction is a register pair. We need to extract the two subregisters
+ // and replace all uses of the original outputs with the extracted
+ // subregisters.
+ uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1};
+ if (IsBigEndian)
+ std::swap(SubRegs[0], SubRegs[1]);
+
+ for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) {
+ if (SDValue(N, ResIdx).use_empty())
+ continue;
+ SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc,
+ MVT::i32, ResultPair);
+ ReplaceUses(SDValue(N, ResIdx), SubReg);
+ }
+
CurDAG->RemoveDeadNode(N);
}
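// Illustrative sketch, not from the patch: the dual-register CDE lowering
// above packs the 64-bit accumulator into a GPR pair and splits the paired
// result back into two 32-bit values, swapping halves on big-endian targets.
// Plain integers, hypothetical names.
#include <cstdint>
#include <utility>

std::pair<uint32_t, uint32_t> packAccumSketch(uint64_t Acc, bool BigEndian) {
  uint32_t Lo = static_cast<uint32_t>(Acc);
  uint32_t Hi = static_cast<uint32_t>(Acc >> 32);
  return BigEndian ? std::make_pair(Hi, Lo) : std::make_pair(Lo, Hi);
}

std::pair<uint32_t, uint32_t>
unpackPairSketch(std::pair<uint32_t, uint32_t> RegPair, bool BigEndian) {
  // Undoing the same half ordering recovers the two scalar results the
  // intrinsic originally produced.
  return BigEndian ? std::make_pair(RegPair.second, RegPair.first) : RegPair;
}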
@@ -2689,6 +2892,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
SDLoc dl(N);
@@ -2725,6 +2929,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
case MVT::v8i16:
case MVT::v4f16:
case MVT::v8f16:
+ case MVT::v4bf16:
+ case MVT::v8bf16:
OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32:
@@ -3202,7 +3408,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
MachineFunction& MF = CurDAG->getMachineFunction();
MachineMemOperand *MemOp =
MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp});
@@ -3222,8 +3428,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Set the alignment of the frame object to 4, to avoid having to generate
// more than one ADD
MachineFrameInfo &MFI = MF->getFrameInfo();
- if (MFI.getObjectAlignment(FI) < 4)
- MFI.setObjectAlignment(FI, 4);
+ if (MFI.getObjectAlign(FI) < Align(4))
+ MFI.setObjectAlignment(FI, Align(4));
CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
CurDAG->getTargetConstant(0, dl, MVT::i32));
return;
@@ -3486,6 +3692,59 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
CurDAG->RemoveDeadNode(N);
return;
}
+ case ARMISD::LDRD: {
+ if (Subtarget->isThumb2())
+ break; // TableGen handles isel in this case.
+ SDValue Base, RegOffset, ImmOffset;
+ const SDValue &Chain = N->getOperand(0);
+ const SDValue &Addr = N->getOperand(1);
+ SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+ if (RegOffset != CurDAG->getRegister(0, MVT::i32)) {
+ // The register-offset variant of LDRD mandates that the register
+ // allocated to RegOffset is not reused in any of the remaining operands.
+ // This restriction is currently not enforced. Therefore emitting this
+ // variant is explicitly avoided.
+ Base = Addr;
+ RegOffset = CurDAG->getRegister(0, MVT::i32);
+ }
+ SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain};
+ SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl,
+ {MVT::Untyped, MVT::Other}, Ops);
+ SDValue Lo = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32,
+ SDValue(New, 0));
+ SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
+ SDValue(New, 0));
+ transferMemOperands(N, New);
+ ReplaceUses(SDValue(N, 0), Lo);
+ ReplaceUses(SDValue(N, 1), Hi);
+ ReplaceUses(SDValue(N, 2), SDValue(New, 1));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
+ case ARMISD::STRD: {
+ if (Subtarget->isThumb2())
+ break; // TableGen handles isel in this case.
+ SDValue Base, RegOffset, ImmOffset;
+ const SDValue &Chain = N->getOperand(0);
+ const SDValue &Addr = N->getOperand(3);
+ SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+ if (RegOffset != CurDAG->getRegister(0, MVT::i32)) {
+ // The register-offset variant of STRD mandates that the register
+ // allocated to RegOffset is not reused in any of the remaining operands.
+ // This restriction is currently not enforced. Therefore emitting this
+ // variant is explicitly avoided.
+ Base = Addr;
+ RegOffset = CurDAG->getRegister(0, MVT::i32);
+ }
+ SDNode *RegPair =
+ createGPRPairNode(MVT::Untyped, N->getOperand(1), N->getOperand(2));
+ SDValue Ops[] = {SDValue(RegPair, 0), Base, RegOffset, ImmOffset, Chain};
+ SDNode *New = CurDAG->getMachineNode(ARM::STOREDUAL, dl, MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceUses(SDValue(N, 0), SDValue(New, 0));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
case ARMISD::LOOP_DEC: {
SDValue Ops[] = { N->getOperand(1),
N->getOperand(2),
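// Illustrative sketch, not from the patch: the ARM-mode LDRD/STRD selection
// above deliberately refuses the register-offset addressing form. If the
// address matcher came back with a register offset, the code falls back to
// using the whole address as the base, because the register-offset encoding
// forbids reusing that register elsewhere in the instruction and nothing here
// enforces that constraint. Hypothetical names.
struct AddrMode3Sketch {
  int BaseReg;
  int OffsetReg; // 0 means "no register offset"
  int ImmOffset;
};

AddrMode3Sketch avoidRegOffsetDualSketch(AddrMode3Sketch AM,
                                         int OriginalAddrReg) {
  if (AM.OffsetReg != 0) {
    AM.BaseReg = OriginalAddrReg; // keep the immediate form instead
    AM.OffsetReg = 0;
  }
  return AM;
}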
@@ -3828,14 +4087,24 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VLD2_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
- ARM::VLD2d16wb_fixed,
- ARM::VLD2d32wb_fixed,
- ARM::VLD1q64wb_fixed};
- static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
- ARM::VLD2q16PseudoWB_fixed,
- ARM::VLD2q32PseudoWB_fixed };
- SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD2d8wb_fixed, ARM::VLD2d16wb_fixed, ARM::VLD2d32wb_fixed,
+ ARM::VLD1q64wb_fixed};
+ static const uint16_t QOpcodes[] = {ARM::VLD2q8PseudoWB_fixed,
+ ARM::VLD2q16PseudoWB_fixed,
+ ARM::VLD2q32PseudoWB_fixed};
+ SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ } else {
+ static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8,
+ ARM::MVE_VLD21_8_wb};
+ static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16,
+ ARM::MVE_VLD21_16_wb};
+ static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
+ ARM::MVE_VLD21_32_wb};
+ static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+ SelectMVE_VLD(N, 2, Opcodes, true);
+ }
return;
}
@@ -3855,17 +4124,30 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VLD4_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
- ARM::VLD4d16Pseudo_UPD,
- ARM::VLD4d32Pseudo_UPD,
- ARM::VLD1d64QPseudoWB_fixed};
- static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
- ARM::VLD4q16Pseudo_UPD,
- ARM::VLD4q32Pseudo_UPD };
- static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
- ARM::VLD4q16oddPseudo_UPD,
- ARM::VLD4q32oddPseudo_UPD };
- SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD,
+ ARM::VLD1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = {ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD};
+ static const uint16_t QOpcodes1[] = {ARM::VLD4q8oddPseudo_UPD,
+ ARM::VLD4q16oddPseudo_UPD,
+ ARM::VLD4q32oddPseudo_UPD};
+ SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ } else {
+ static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8,
+ ARM::MVE_VLD42_8,
+ ARM::MVE_VLD43_8_wb};
+ static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16,
+ ARM::MVE_VLD42_16,
+ ARM::MVE_VLD43_16_wb};
+ static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32,
+ ARM::MVE_VLD42_32,
+ ARM::MVE_VLD43_32_wb};
+ static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+ SelectMVE_VLD(N, 4, Opcodes, true);
+ }
return;
}
@@ -3913,15 +4195,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VST2_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
- ARM::VST2d16wb_fixed,
- ARM::VST2d32wb_fixed,
- ARM::VST1q64wb_fixed};
- static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
- ARM::VST2q16PseudoWB_fixed,
- ARM::VST2q32PseudoWB_fixed };
- SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
- return;
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VST2d8wb_fixed, ARM::VST2d16wb_fixed, ARM::VST2d32wb_fixed,
+ ARM::VST1q64wb_fixed};
+ static const uint16_t QOpcodes[] = {ARM::VST2q8PseudoWB_fixed,
+ ARM::VST2q16PseudoWB_fixed,
+ ARM::VST2q32PseudoWB_fixed};
+ SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+ break;
}
case ARMISD::VST3_UPD: {
@@ -3940,18 +4224,20 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VST4_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
- ARM::VST4d16Pseudo_UPD,
- ARM::VST4d32Pseudo_UPD,
- ARM::VST1d64QPseudoWB_fixed};
- static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
- ARM::VST4q16Pseudo_UPD,
- ARM::VST4q32Pseudo_UPD };
- static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
- ARM::VST4q16oddPseudo_UPD,
- ARM::VST4q32oddPseudo_UPD };
- SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
- return;
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, ARM::VST4d32Pseudo_UPD,
+ ARM::VST1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = {ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD};
+ static const uint16_t QOpcodes1[] = {ARM::VST4q8oddPseudo_UPD,
+ ARM::VST4q16oddPseudo_UPD,
+ ARM::VST4q32oddPseudo_UPD};
+ SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+ break;
}
case ARMISD::VST2LN_UPD: {
@@ -4430,7 +4716,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
ARM::MVE_VLD21_32};
static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
- SelectMVE_VLD(N, 2, Opcodes);
+ SelectMVE_VLD(N, 2, Opcodes, false);
return;
}
@@ -4444,7 +4730,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
ARM::MVE_VLD42_32,
ARM::MVE_VLD43_32};
static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
- SelectMVE_VLD(N, 4, Opcodes);
+ SelectMVE_VLD(N, 4, Opcodes, false);
return;
}
}
@@ -4457,6 +4743,29 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
default:
break;
+ // Scalar f32 -> bf16
+ case Intrinsic::arm_neon_vcvtbfp2bf: {
+ SDLoc dl(N);
+ const SDValue &Src = N->getOperand(1);
+ llvm::EVT DestTy = N->getValueType(0);
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { Src, Src, Pred, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops);
+ return;
+ }
+
+ // Vector v4f32 -> v4bf16
+ case Intrinsic::arm_neon_vcvtfp2bf: {
+ SDLoc dl(N);
+ const SDValue &Src = N->getOperand(1);
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { Src, Pred, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4bf16, Ops);
+ return;
+ }
+
case Intrinsic::arm_mve_urshrl:
SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false);
return;
@@ -4475,18 +4784,21 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case Intrinsic::arm_mve_sqrshrl:
SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true);
return;
- case Intrinsic::arm_mve_lsll:
- SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false);
- return;
- case Intrinsic::arm_mve_asrl:
- SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false);
- return;
case Intrinsic::arm_mve_vadc:
case Intrinsic::arm_mve_vadc_predicated:
SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true,
IntNo == Intrinsic::arm_mve_vadc_predicated);
return;
+ case Intrinsic::arm_mve_vsbc:
+ case Intrinsic::arm_mve_vsbc_predicated:
+ SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
+ IntNo == Intrinsic::arm_mve_vsbc_predicated);
+ return;
+ case Intrinsic::arm_mve_vshlc:
+ case Intrinsic::arm_mve_vshlc_predicated:
+ SelectMVE_VSHLC(N, IntNo == Intrinsic::arm_mve_vshlc_predicated);
+ return;
case Intrinsic::arm_mve_vmlldava:
case Intrinsic::arm_mve_vmlldava_predicated: {
@@ -4524,6 +4836,80 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
OpcodesS, OpcodesU);
return;
}
+
+ case Intrinsic::arm_mve_vidup:
+ case Intrinsic::arm_mve_vidup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VIDUPu8, ARM::MVE_VIDUPu16, ARM::MVE_VIDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, false,
+ IntNo == Intrinsic::arm_mve_vidup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vddup:
+ case Intrinsic::arm_mve_vddup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VDDUPu8, ARM::MVE_VDDUPu16, ARM::MVE_VDDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, false,
+ IntNo == Intrinsic::arm_mve_vddup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_viwdup:
+ case Intrinsic::arm_mve_viwdup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VIWDUPu8, ARM::MVE_VIWDUPu16, ARM::MVE_VIWDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, true,
+ IntNo == Intrinsic::arm_mve_viwdup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vdwdup:
+ case Intrinsic::arm_mve_vdwdup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VDWDUPu8, ARM::MVE_VDWDUPu16, ARM::MVE_VDWDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, true,
+ IntNo == Intrinsic::arm_mve_vdwdup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_cde_cx1d:
+ case Intrinsic::arm_cde_cx1da:
+ case Intrinsic::arm_cde_cx2d:
+ case Intrinsic::arm_cde_cx2da:
+ case Intrinsic::arm_cde_cx3d:
+ case Intrinsic::arm_cde_cx3da: {
+ bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da ||
+ IntNo == Intrinsic::arm_cde_cx2da ||
+ IntNo == Intrinsic::arm_cde_cx3da;
+ size_t NumExtraOps;
+ uint16_t Opcode;
+ switch (IntNo) {
+ case Intrinsic::arm_cde_cx1d:
+ case Intrinsic::arm_cde_cx1da:
+ NumExtraOps = 0;
+ Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D;
+ break;
+ case Intrinsic::arm_cde_cx2d:
+ case Intrinsic::arm_cde_cx2da:
+ NumExtraOps = 1;
+ Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D;
+ break;
+ case Intrinsic::arm_cde_cx3d:
+ case Intrinsic::arm_cde_cx3da:
+ NumExtraOps = 2;
+ Opcode = HasAccum ? ARM::CDE_CX3DA : ARM::CDE_CX3D;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+ SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum);
+ return;
+ }
}
break;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9f504b1eaa42..287e2e60e572 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -210,6 +210,8 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
if (!VT.isFloatingPoint() &&
VT != MVT::v2i64 && VT != MVT::v1i64)
@@ -284,6 +286,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
// Vector reductions
@@ -292,6 +296,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
if (!HasMVEFP) {
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -341,6 +349,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
// No native support for these.
setOperationAction(ISD::FDIV, VT, Expand);
@@ -358,6 +370,17 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
}
}
+  // Custom-expand vector reductions that are smaller than legal, to prevent
+  // false zero items from being added.
+ setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
+
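
The "false zero items" mentioned above come from widening a short reduction to the full vector width: for reductions such as FMUL or FMIN, zero padding lanes would change the result, which is why the v4f16/v2f16 cases are custom-expanded instead. A scalar sketch of the hazard (plain float and made-up helper names, purely for illustration):

#include <array>

// Product of the four real lanes.
float fmulReduce4(const std::array<float, 4> &V) {
  return V[0] * V[1] * V[2] * V[3];
}

// The same reduction after naively widening to eight lanes padded with zeros:
// the extra 0.0 factors force the result to 0.0 regardless of the real data.
float fmulReduce8Padded(const std::array<float, 4> &V) {
  std::array<float, 8> Wide = {V[0], V[1], V[2], V[3], 0.f, 0.f, 0.f, 0.f};
  float Acc = 1.0f;
  for (float L : Wide)
    Acc *= L;
  return Acc;
}
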
// We 'support' these types up to bitcast/load/store level, regardless of
// MVE integer-only / float support. Only doing FP data processing on the FP
// vector types is inhibited at integer-only level.
@@ -717,13 +740,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasFullFP16()) {
addRegisterClass(MVT::f16, &ARM::HPRRegClass);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::i32, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
}
+ if (Subtarget->hasBF16()) {
+ addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
+ setAllExpand(MVT::bf16);
+ if (!Subtarget->hasFullFP16())
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+ }
+
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -771,6 +800,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8f16);
addDRTypeForNEON(MVT::v4f16);
}
+
+ if (Subtarget->hasBF16()) {
+ addQRTypeForNEON(MVT::v8bf16);
+ addDRTypeForNEON(MVT::v4bf16);
+ }
}
if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
@@ -912,9 +946,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::v4f32, Expand);
}
- setTargetDAGCombine(ISD::INTRINSIC_VOID);
- setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
- setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SRA);
@@ -938,10 +969,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::VECREDUCE_ADD);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::BITCAST);
+ }
+ if (Subtarget->hasMVEIntegerOps()) {
+ setTargetDAGCombine(ISD::SMIN);
+ setTargetDAGCombine(ISD::UMIN);
+ setTargetDAGCombine(ISD::SMAX);
+ setTargetDAGCombine(ISD::UMAX);
+ setTargetDAGCombine(ISD::FP_EXTEND);
}
if (!Subtarget->hasFP64()) {
@@ -1073,6 +1118,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Custom);
+ setOperationAction(ISD::STORE, MVT::i64, Custom);
// MVE lowers 64 bit shifts to lsll and lsrl
// assuming that ISD::SRL and SRA of i64 are already marked custom
@@ -1419,12 +1466,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasNEON()) {
- // vmin and vmax aren't available in a scalar form, so we use
- // a NEON instruction with an undef lane instead.
- setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ // vmin and vmax aren't available in a scalar form, so we can use
+ // a NEON instruction with an undef lane instead. This has a performance
+ // penalty on some cores, so we don't do this unless we have been
+ // asked to by the core tuning model.
+ if (Subtarget->useNEONForSinglePrecisionFP()) {
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
+ }
setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
@@ -1452,6 +1503,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
+ if (Subtarget->hasMVEIntegerOps())
+ setTargetDAGCombine(ISD::VSELECT);
+
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
if (Subtarget->isThumb1Only())
@@ -1550,10 +1604,12 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CALL: return "ARMISD::CALL";
case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
+ case ARMISD::tSECALL: return "ARMISD::tSECALL";
case ARMISD::BRCOND: return "ARMISD::BRCOND";
case ARMISD::BR_JT: return "ARMISD::BR_JT";
case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+ case ARMISD::SERET_FLAG: return "ARMISD::SERET_FLAG";
case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
@@ -1606,10 +1662,14 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::PRELOAD: return "ARMISD::PRELOAD";
+ case ARMISD::LDRD: return "ARMISD::LDRD";
+ case ARMISD::STRD: return "ARMISD::STRD";
+
case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK";
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
+ case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
case ARMISD::VCMP: return "ARMISD::VCMP";
case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
case ARMISD::VTST: return "ARMISD::VTST";
@@ -1650,8 +1710,28 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTBL1: return "ARMISD::VTBL1";
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMOVN: return "ARMISD::VMOVN";
+ case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs";
+ case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu";
+ case ARMISD::VCVTN: return "ARMISD::VCVTN";
+ case ARMISD::VCVTL: return "ARMISD::VCVTL";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::VADDVs: return "ARMISD::VADDVs";
+ case ARMISD::VADDVu: return "ARMISD::VADDVu";
+ case ARMISD::VADDLVs: return "ARMISD::VADDLVs";
+ case ARMISD::VADDLVu: return "ARMISD::VADDLVu";
+ case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs";
+ case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu";
+ case ARMISD::VADDLVps: return "ARMISD::VADDLVps";
+ case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu";
+ case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps";
+ case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu";
+ case ARMISD::VMLAVs: return "ARMISD::VMLAVs";
+ case ARMISD::VMLAVu: return "ARMISD::VMLAVu";
+ case ARMISD::VMLALVs: return "ARMISD::VMLALVs";
+ case ARMISD::VMLALVu: return "ARMISD::VMLALVu";
+ case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs";
+ case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu";
case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
@@ -1955,6 +2035,35 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
}
}
+SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
+ MVT LocVT, MVT ValVT, SDValue Val) const {
+ Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
+ Val);
+ if (Subtarget->hasFullFP16()) {
+ Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
+ } else {
+ Val = DAG.getNode(ISD::TRUNCATE, dl,
+ MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
+ Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
+ }
+ return Val;
+}
+
+SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
+ MVT LocVT, MVT ValVT,
+ SDValue Val) const {
+ if (Subtarget->hasFullFP16()) {
+ Val = DAG.getNode(ARMISD::VMOVrh, dl,
+ MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
+ } else {
+ Val = DAG.getNode(ISD::BITCAST, dl,
+ MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
+ Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
+ }
+ return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
+}
+
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
@@ -1982,7 +2091,8 @@ SDValue ARMTargetLowering::LowerCallResult(
}
SDValue Val;
- if (VA.needsCustom()) {
+ if (VA.needsCustom() &&
+ (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
// Handle f64 or half of a v2f64.
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
@@ -2031,6 +2141,13 @@ SDValue ARMTargetLowering::LowerCallResult(
break;
}
+    // f16 arguments have their size extended to 4 bytes and are passed as if
+    // they had been copied to the LSBs of a 32-bit register.
+    // They are passed extended to i32 (soft ABI) or to f32 (hard ABI).
+ if (VA.needsCustom() &&
+ (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
+ Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
+
InVals.push_back(Val);
}
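
As a plain-integer sketch of the convention described in the comment above (the helper names are illustrative, not from the patch): the half-precision value occupies only the low 16 bits of the 32-bit location it travels in, and MoveToHPR/MoveFromHPR are the SelectionDAG-level equivalents of packing and unpacking those bits.

#include <cstdint>

// Place a raw f16 bit pattern in the low 16 bits of a 32-bit slot; the upper
// bits are simply zero here.
uint32_t packHalfInLowBits(uint16_t HalfBits) {
  return static_cast<uint32_t>(HalfBits);
}

// Recover the f16 bit pattern from the low 16 bits of the slot.
uint16_t unpackHalfFromLowBits(uint32_t Slot) {
  return static_cast<uint16_t>(Slot & 0xffffu);
}
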
@@ -2097,22 +2214,34 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
MachineFunction::CallSiteInfo CSInfo;
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
+ bool isCmseNSCall = false;
bool PreferIndirect = false;
+ // Determine whether this is a non-secure function call.
+ if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call"))
+ isCmseNSCall = true;
+
// Disable tail calls if they're not supported.
if (!Subtarget->supportsTailCall())
isTailCall = false;
+ // For both the non-secure calls and the returns from a CMSE entry function,
+  // the function needs to do some extra work after the call, or before the
+  // return, respectively, thus it cannot end with a tail call.
+ if (isCmseNSCall || AFI->isCmseNSEntryFunction())
+ isTailCall = false;
+
if (isa<GlobalAddressSDNode>(Callee)) {
// If we're optimizing for minimum size and the function is called three or
// more times in this block, we can improve codesize by calling indirectly
// as BLXr has a 16-bit encoding.
auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- if (CLI.CS) {
- auto *BB = CLI.CS.getParent();
+ if (CLI.CB) {
+ auto *BB = CLI.CB->getParent();
PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
count_if(GV->users(), [&BB](const User *U) {
return isa<Instruction>(U) &&
@@ -2126,7 +2255,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee, CallConv, isVarArg, isStructRet,
MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
PreferIndirect);
- if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
+ if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt for ARM, only automatically
@@ -2187,31 +2316,50 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
break;
}
- // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
- if (VA.needsCustom()) {
- if (VA.getLocVT() == MVT::v2f64) {
- SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
- DAG.getConstant(0, dl, MVT::i32));
- SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
- DAG.getConstant(1, dl, MVT::i32));
-
- PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
- VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
-
- VA = ArgLocs[++i]; // skip ahead to next loc
- if (VA.isRegLoc()) {
- PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
- VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
- } else {
- assert(VA.isMemLoc());
+    // f16 arguments have their size extended to 4 bytes and are passed as if
+    // they had been copied to the LSBs of a 32-bit register.
+    // They are passed extended to i32 (soft ABI) or to f32 (hard ABI).
+ if (VA.needsCustom() &&
+ (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
+ Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
+ } else {
+ // f16 arguments could have been extended prior to argument lowering.
+      // Mask them if this is a CMSE nonsecure call.
+ auto ArgVT = Outs[realArgIdx].ArgVT;
+ if (isCmseNSCall && (ArgVT == MVT::f16)) {
+ auto LocBits = VA.getLocVT().getSizeInBits();
+ auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
+ SDValue Mask =
+ DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
+ Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ }
+ }
- MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
- dl, DAG, VA, Flags));
- }
- } else {
- PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+ if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
+ SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
+ StackPtr, MemOpChains, Flags);
+
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ if (VA.isRegLoc()) {
+ PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
StackPtr, MemOpChains, Flags);
+ } else {
+ assert(VA.isMemLoc());
+
+ MemOpChains.push_back(
+ LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
}
+ } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
+ PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ StackPtr, MemOpChains, Flags);
} else if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i32) {
@@ -2222,7 +2370,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isThisReturn = true;
}
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else if (isByVal) {
@@ -2245,9 +2393,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
- SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
- MachinePointerInfo(),
- DAG.InferPtrAlignment(AddArg));
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
+ DAG.InferPtrAlign(AddArg));
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
}
@@ -2268,8 +2416,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
MVT::i32);
- SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
- MVT::i32);
+ SDValue AlignNode =
+ DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
@@ -2311,7 +2459,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
bool isLocalARMFunc = false;
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
auto PtrVt = getPointerTy(DAG.getDataLayout());
if (Subtarget->genLongCalls()) {
@@ -2327,7 +2474,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
// Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
@@ -2341,7 +2488,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 0);
// Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
@@ -2393,7 +2540,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 4);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
@@ -2405,10 +2552,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
+ if (isCmseNSCall) {
+ assert(!isARMFunc && !isDirect &&
+ "Cannot handle call to ARM function or direct call");
+ if (NumBytes > 0) {
+ DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
+ "call to non-secure function would "
+ "require passing arguments on stack",
+ dl.getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+ if (isStructRet) {
+ DiagnosticInfoUnsupported Diag(
+ DAG.getMachineFunction().getFunction(),
+ "call to non-secure function would return value through pointer",
+ dl.getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+ }
+
// FIXME: handle tail calls differently.
unsigned CallOpc;
if (Subtarget->isThumb()) {
- if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+ if (isCmseNSCall)
+ CallOpc = ARMISD::tSECALL;
+ else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = ARMISD::CALL;
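
A source-level sketch of the two situations those CMSE diagnostics reject, assuming the ACLE attribute spelling and compilation with -mcmse for an Armv8-M target (the type names are made up for illustration):

struct Big { int v[4]; };

// Five integer arguments overflow r0-r3, so a call through this type would
// need stack-passed arguments ("passing arguments on stack").
typedef int __attribute__((cmse_nonsecure_call)) NSCallManyArgs(int, int, int,
                                                                 int, int);

// A 16-byte struct is returned indirectly through a pointer under the AAPCS
// ("return value through pointer").
typedef Big __attribute__((cmse_nonsecure_call)) NSCallStructRet();
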
@@ -2468,6 +2636,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -2488,15 +2657,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// and then confiscate the rest of the parameter registers to insure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
- unsigned Align) const {
+ Align Alignment) const {
// Byval (as with any stack) slots are always at least 4 byte aligned.
- Align = std::max(Align, 4U);
+ Alignment = std::max(Alignment, Align(4));
unsigned Reg = State->AllocateReg(GPRArgRegs);
if (!Reg)
return;
- unsigned AlignInRegs = Align / 4;
+ unsigned AlignInRegs = Alignment.value() / 4;
unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
for (unsigned i = 0; i < Waste; ++i)
Reg = State->AllocateReg(GPRArgRegs);
@@ -2635,9 +2804,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
// Check that the call results are passed in the same way.
LLVMContext &C = *DAG.getContext();
- if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
- CCAssignFnForReturn(CalleeCC, isVarArg),
- CCAssignFnForReturn(CallerCC, isVarArg)))
+ if (!CCState::resultsCompatible(
+ getEffectiveCallingConv(CalleeCC, isVarArg),
+ getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
+ CCAssignFnForReturn(CalleeCC, isVarArg),
+ CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
@@ -2678,7 +2849,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
- if (VA.needsCustom()) {
+ if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
// f64 and vector types are split into multiple registers or
// register/stack-slot combinations. The types will not match
// the registers; give up on memory f64 refs until we figure
@@ -2777,6 +2948,17 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
AFI->setReturnRegsCount(RVLocs.size());
+ // Report error if cmse entry function returns structure through first ptr arg.
+ if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
+ // Note: using an empty SDLoc(), as the first line of the function is a
+ // better place to report than the last line.
+ DiagnosticInfoUnsupported Diag(
+ DAG.getMachineFunction().getFunction(),
+ "secure entry function would return value through pointer",
+ SDLoc().getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+
// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
@@ -2819,7 +3001,24 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
break;
}
- if (VA.needsCustom()) {
+ // Mask f16 arguments if this is a CMSE nonsecure entry.
+ auto RetVT = Outs[realRVLocIdx].ArgVT;
+ if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
+ if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
+ Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
+ } else {
+ auto LocBits = VA.getLocVT().getSizeInBits();
+ auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
+ SDValue Mask =
+ DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
+ Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ }
+ }
+
+ if (VA.needsCustom() &&
+ (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
if (VA.getLocVT() == MVT::v2f64) {
// Extract the first half and return it in two registers.
SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
@@ -2827,15 +3026,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Half);
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- HalfGPRs.getValue(isLittleEndian ? 0 : 1),
- Flag);
+ Chain =
+ DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- HalfGPRs.getValue(isLittleEndian ? 1 : 0),
- Flag);
+ Chain =
+ DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
@@ -2849,22 +3048,20 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- fmrrd.getValue(isLittleEndian ? 0 : 1),
- Flag);
+ fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- fmrrd.getValue(isLittleEndian ? 1 : 0),
- Flag);
+ fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
} else
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(),
- ReturnF16 ? MVT::f16 : VA.getLocVT()));
+ RetOps.push_back(DAG.getRegister(
+ VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -2898,7 +3095,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return LowerInterruptReturn(RetOps, dl, DAG);
}
- return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
+ ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
+ ARMISD::RET_FLAG;
+ return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
}
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -3040,11 +3239,10 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
}
if (CP->isMachineConstantPoolEntry())
- Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment());
+ Res =
+ DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
else
- Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment());
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}
@@ -3063,14 +3261,14 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
SDValue CPAddr;
bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
if (!IsPositionIndependent) {
- CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
+ CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
} else {
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
ARMCP::CPBlockAddress, PCAdj);
- CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
}
CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
SDValue Result = DAG.getLoad(
@@ -3199,8 +3397,9 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
const auto *GA = cast<GlobalAddressSDNode>(Op);
auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
SDValue Offset = DAG.getLoad(
- PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
- DAG.getTargetConstantPool(CPV, PtrVT, 4)),
+ PtrVT, DL, Chain,
+ DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
+ DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
@@ -3219,7 +3418,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
- SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
Argument = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), Argument,
@@ -3270,7 +3469,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
true);
- Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
@@ -3288,7 +3487,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
assert(model == TLSModel::LocalExec);
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
- Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
@@ -3391,11 +3590,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
// that are strings for simplicity.
auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
- unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
+ Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
unsigned RequiredPadding = 4 - (Size % 4);
bool PaddingPossible =
RequiredPadding == 4 || (CDAInit && CDAInit->isString());
- if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
+ if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
Size == 0)
return SDValue();
@@ -3434,8 +3633,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
}
auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
- SDValue CPAddr =
- DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
AFI->markGlobalAsPromotedToConstantPool(GVar);
AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
@@ -3505,7 +3703,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
} else { // use literal pool for address constant
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
RelAddr = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -3525,7 +3723,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
DAG.getTargetGlobalAddress(GV, dl, PtrVT));
} else {
- SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
return DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -3636,7 +3834,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
SDValue ReturnAddress =
DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
- std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue};
+ constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
SDValue Callee =
DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
SDValue RegisterMask = DAG.getRegisterMask(Mask);
@@ -3720,7 +3918,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
ARMCP::CPLSDA, PCAdj);
- CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -3782,6 +3980,15 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_mve_pred_v2i:
return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::arm_mve_vreinterpretq:
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::arm_mve_lsll:
+ return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::arm_mve_asrl:
+ return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
@@ -3982,6 +4189,42 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
AFI->setVarArgsFrameIndex(FrameIndex);
}
+bool ARMTargetLowering::splitValueIntoRegisterParts(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+ bool IsABIRegCopy = CC.hasValue();
+ EVT ValueVT = Val.getValueType();
+ if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+ PartVT == MVT::f32) {
+ unsigned ValueBits = ValueVT.getSizeInBits();
+ unsigned PartBits = PartVT.getSizeInBits();
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ Parts[0] = Val;
+ return true;
+ }
+ return false;
+}
+
+SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
+ SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+ bool IsABIRegCopy = CC.hasValue();
+ if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+ PartVT == MVT::f32) {
+ unsigned ValueBits = ValueVT.getSizeInBits();
+ unsigned PartBits = PartVT.getSizeInBits();
+ SDValue Val = Parts[0];
+
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ return Val;
+ }
+ return SDValue();
+}
+
SDValue ARMTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -4054,44 +4297,41 @@ SDValue ARMTargetLowering::LowerFormalArguments(
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
- if (VA.needsCustom()) {
+ if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
// f64 and vector types are split up into multiple registers or
// combinations of registers and stack slots.
- if (VA.getLocVT() == MVT::v2f64) {
- SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
- Chain, DAG, dl);
- VA = ArgLocs[++i]; // skip ahead to next loc
- SDValue ArgValue2;
- if (VA.isMemLoc()) {
- int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), FI));
- } else {
- ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
- Chain, DAG, dl);
- }
- ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
- ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
- ArgValue, ArgValue1,
- DAG.getIntPtrConstant(0, dl));
- ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
- ArgValue, ArgValue2,
- DAG.getIntPtrConstant(1, dl));
- } else
- ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+ SDValue ArgValue1 =
+ GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ SDValue ArgValue2;
+ if (VA.isMemLoc()) {
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgValue2 = DAG.getLoad(
+ MVT::f64, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+ }
+ ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
+ ArgValue1, DAG.getIntPtrConstant(0, dl));
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
+ ArgValue2, DAG.getIntPtrConstant(1, dl));
+ } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
+ ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
} else {
const TargetRegisterClass *RC;
-
- if (RegVT == MVT::f16)
+ if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &ARM::HPRRegClass;
else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
- else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
+ else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
+ RegVT == MVT::v4bf16)
RC = &ARM::DPRRegClass;
- else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
+ else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
+ RegVT == MVT::v8bf16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
@@ -4131,6 +4371,13 @@ SDValue ARMTargetLowering::LowerFormalArguments(
break;
}
+      // f16 arguments have their size extended to 4 bytes and are passed as
+      // if they had been copied to the LSBs of a 32-bit register.
+      // They are passed extended to i32 (soft ABI) or to f32 (hard ABI).
+ if (VA.needsCustom() &&
+ (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
+ ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
+
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
// sanity check
@@ -5709,85 +5956,27 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *Subtarget) {
+SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
- // This function is only supposed to be called for i64 types, either as the
- // source or destination of the bit convert.
+ // This function is only supposed to be called for i16 and i64 types, either
+ // as the source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
- const bool HasFullFP16 = Subtarget->hasFullFP16();
-
- if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
- // FullFP16: half values are passed in S-registers, and we don't
- // need any of the bitcast and moves:
- //
- // t2: f32,ch = CopyFromReg t0, Register:f32 %0
- // t5: i32 = bitcast t2
- // t18: f16 = ARMISD::VMOVhr t5
- if (Op.getOpcode() != ISD::CopyFromReg ||
- Op.getValueType() != MVT::f32)
- return SDValue();
-
- auto Move = N->use_begin();
- if (Move->getOpcode() != ARMISD::VMOVhr)
- return SDValue();
-
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
- SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
- DAG.ReplaceAllUsesWith(*Move, &Copy);
- return Copy;
- }
-
- if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
- if (!HasFullFP16)
- return SDValue();
- // SoftFP: read half-precision arguments:
- //
- // t2: i32,ch = ...
- // t7: i16 = truncate t2 <~~~~ Op
- // t8: f16 = bitcast t7 <~~~~ N
- //
- if (Op.getOperand(0).getValueType() == MVT::i32)
- return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
- MVT::f16, Op.getOperand(0));
-
- return SDValue();
- }
- // Half-precision return values
- if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
- if (!HasFullFP16)
- return SDValue();
- //
- // t11: f16 = fadd t8, t10
- // t12: i16 = bitcast t11 <~~~ SDNode N
- // t13: i32 = zero_extend t12
- // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
- // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
- //
- // transform this into:
- //
- // t20: i32 = ARMISD::VMOVrh t11
- // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
- //
- auto ZeroExtend = N->use_begin();
- if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
- ZeroExtend->getValueType(0) != MVT::i32)
- return SDValue();
+ if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
+ (DstVT == MVT::f16 || DstVT == MVT::bf16))
+ return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
+ DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
- auto Copy = ZeroExtend->use_begin();
- if (Copy->getOpcode() == ISD::CopyToReg &&
- Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
- SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
- DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
- return Cvt;
- }
- return SDValue();
- }
+ if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
+ (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
+ return DAG.getNode(
+ ISD::TRUNCATE, SDLoc(N), DstVT,
+ MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
return SDValue();
@@ -5930,16 +6119,20 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue Ops[] = { DAG.getEntryNode(),
- DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ops[] = {Chain,
+ DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
- SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
+ SDValue FPSCR =
+ DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
+ Chain = FPSCR.getValue(1);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
- return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
- DAG.getConstant(3, dl, MVT::i32));
+ SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+ return DAG.getMergeValues({And, Chain}, dl);
}
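
To make the formula above concrete: FPSCR.RMode encodes nearest/+inf/-inf/zero as 0/1/2/3, while the FLT_ROUNDS convention encodes them as 1/2/3/0, so adding 1 << 22 before extracting bits [23:22] rotates one encoding into the other. A standalone model of that arithmetic (illustrative only):

#include <cstdint>

// Map FPSCR.RMode (0 = nearest, 1 = +inf, 2 = -inf, 3 = zero) onto the
// FLT_ROUNDS values (1 = nearest, 2 = +inf, 3 = -inf, 0 = zero).
unsigned fltRoundsFromFPSCR(uint32_t Fpscr) {
  return ((Fpscr + (1u << 22)) >> 22) & 3u;
}
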
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
@@ -6424,9 +6617,10 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
- const SDLoc &dl, EVT &VT, bool is128Bits,
+ const SDLoc &dl, EVT &VT, EVT VectorVT,
VMOVModImmType type) {
unsigned OpCmode, Imm;
+ bool is128Bits = VectorVT.is128BitVector();
// SplatBitSize is set to the smallest size that splats the vector, so a
// zero vector will always have SplatBitSize == 8. However, NEON modified
@@ -6544,9 +6738,18 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
ImmMask <<= 1;
}
- if (DAG.getDataLayout().isBigEndian())
- // swap higher and lower 32 bit word
- Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+ if (DAG.getDataLayout().isBigEndian()) {
+ // Reverse the order of elements within the vector.
+ unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
+ unsigned Mask = (1 << BytesPerElem) - 1;
+ unsigned NumElems = 8 / BytesPerElem;
+ unsigned NewImm = 0;
+ for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
+ unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
+ NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
+ }
+ Imm = NewImm;
+ }
// Op=1, Cmode=1110.
OpCmode = 0x1e;
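
A quick worked check of the reversal loop above, assuming 2-byte elements (BytesPerElem = 2, NumElems = 4): a byte-enable immediate that covers only the lowest element must end up covering only the highest one. The helper below merely restates the loop so the example can be checked in isolation:

constexpr unsigned reverseElemByteEnables(unsigned Imm, unsigned BytesPerElem) {
  unsigned Mask = (1u << BytesPerElem) - 1;
  unsigned NumElems = 8 / BytesPerElem;
  unsigned NewImm = 0;
  for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum)
    NewImm |= ((Imm >> (ElemNum * BytesPerElem)) & Mask)
              << ((NumElems - ElemNum - 1) * BytesPerElem);
  return NewImm;
}
// Element 0 (0b00000011) moves to element 3 (0b11000000) for 2-byte elements.
static_assert(reverseElemByteEnables(0x03, 2) == 0xc0, "element order reversed");
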
@@ -6585,8 +6788,6 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
case MVT::f64: {
SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
- if (!ST->isLittle())
- std::swap(Lo, Hi);
return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
}
case MVT::f32:
@@ -6639,7 +6840,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
// Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
- VMovVT, false, VMOVModImm);
+ VMovVT, VT, VMOVModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
@@ -6656,7 +6857,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
// Finally, try a VMVN.i32
NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
- false, VMVNModImm);
+ VT, VMVNModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
@@ -7064,6 +7265,104 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
return true;
}
+// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
+// from a pair of inputs. For example:
+// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0)),
+//             FP_ROUND(EXTRACT_ELT(Y, 0)),
+//             FP_ROUND(EXTRACT_ELT(X, 1)),
+//             FP_ROUND(EXTRACT_ELT(Y, 1)), ...)
+static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+
+ SDLoc dl(BV);
+ EVT VT = BV.getValueType();
+ if (VT != MVT::v8f16)
+ return SDValue();
+
+ // We are looking for a buildvector of fptrunc elements, where all the
+  // elements are alternately extracted from two sources. Check the first two
+ // items are valid enough and extract some info from them (they are checked
+ // properly in the loop below).
+ if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
+ BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
+ return SDValue();
+ if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
+ BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
+ return SDValue();
+ SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
+ SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
+ if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
+ return SDValue();
+
+ // Check all the values in the BuildVector line up with our expectations.
+ for (unsigned i = 1; i < 4; i++) {
+ auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
+ return Trunc.getOpcode() == ISD::FP_ROUND &&
+ Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Trunc.getOperand(0).getOperand(0) == Op &&
+ Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
+ };
+ if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
+ return SDValue();
+ if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
+ return SDValue();
+ }
+
+ SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
+ DAG.getConstant(1, dl, MVT::i32));
+}
+
+// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
+// from a single input on alternating lanes. For example:
+// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0)),
+//             FP_EXTEND(EXTRACT_ELT(X, 2)),
+//             FP_EXTEND(EXTRACT_ELT(X, 4)), ...)
+static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+
+ SDLoc dl(BV);
+ EVT VT = BV.getValueType();
+ if (VT != MVT::v4f32)
+ return SDValue();
+
+  // We are looking for a buildvector of fpext elements, where all the
+ // elements are alternating lanes from a single source. For example <0,2,4,6>
+ // or <1,3,5,7>. Check the first two items are valid enough and extract some
+ // info from them (they are checked properly in the loop below).
+ if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
+ BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
+ int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
+ if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
+ return SDValue();
+
+ // Check all the values in the BuildVector line up with our expectations.
+ for (unsigned i = 1; i < 4; i++) {
+ auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
+ return Trunc.getOpcode() == ISD::FP_EXTEND &&
+ Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Trunc.getOperand(0).getOperand(0) == Op &&
+ Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
+ };
+ if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
+ return SDValue();
+ }
+
+ return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
+ DAG.getConstant(Offset, dl, MVT::i32));
+}
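For the fpext direction, a similarly hedged sketch (invented name, _Float16 support assumed): every other f16 lane of a single v8f16 input, starting at Offset 0 or 1, is widened to f32, matching the VCVTL(Op0, Offset) node created above.

#include <cstddef>

// Illustration only: widen lanes Offset, Offset+2, Offset+4, Offset+6.
void alternating_fpext(const _Float16 In[8], int Offset, float Out[4]) {
  for (std::size_t I = 0; I < 4; ++I)
    Out[I] = static_cast<float>(In[2 * I + Offset]);
}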
+
// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
@@ -7163,13 +7462,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
return DAG.getUNDEF(VT);
if ((ST->hasNEON() && SplatBitSize <= 64) ||
- (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
+ (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
// Check if an immediate VMOV works.
EVT VmovVT;
- SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
- SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VmovVT, VT.is128BitVector(),
- VMOVModImm);
+ SDValue Val =
+ isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+ SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
@@ -7179,9 +7477,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
Val = isVMOVModifiedImm(
- NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VmovVT, VT.is128BitVector(),
- ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
+ NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
+ VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -7321,12 +7618,19 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (isConstant)
return SDValue();
- // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
- if (NumElts >= 4) {
- SDValue shuffle = ReconstructShuffle(Op, DAG);
- if (shuffle != SDValue())
+ // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
+ // vmovn). Empirical tests suggest this is rarely worth it for vectors of
+ // length <= 2.
+ if (NumElts >= 4)
+ if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
- }
+
+ // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
+ // VCVT's
+ if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
+ return VCVT;
+ if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
+ return VCVT;
if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
// If we haven't found an efficient lowering, try splitting a 128-bit vector
@@ -7527,7 +7831,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
- Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+ Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
@@ -7579,7 +7883,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[1], Mask, DAG);
if (!Shuffle)
return SDValue();
- return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}
enum ShuffleOpCodes {
@@ -8892,7 +9196,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
if (ShouldUseSRet) {
// Create stack object for sret.
const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
- const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
+ const Align StackAlign = DL.getPrefTypeAlign(RetTy);
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
@@ -9067,8 +9371,7 @@ void ARMTargetLowering::ExpandDIV_Windows(
DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
- Results.push_back(Lower);
- Results.push_back(Upper);
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
@@ -9101,6 +9404,25 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
}
+void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
+
+ if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+ !Subtarget->isThumb1Only() && LD->isVolatile()) {
+ SDLoc dl(N);
+ SDValue Result = DAG.getMemIntrinsicNode(
+ ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
+ {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
+ SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
+ SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+ Results.append({Pair, Result.getValue(2)});
+ }
+}
+
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
EVT MemVT = ST->getMemoryVT();
@@ -9130,6 +9452,38 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
ST->getMemOperand());
}
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ EVT MemVT = ST->getMemoryVT();
+ assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
+
+ if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
+ !Subtarget->isThumb1Only() && ST->isVolatile()) {
+ SDNode *N = Op.getNode();
+ SDLoc dl(N);
+
+ SDValue Lo = DAG.getNode(
+ ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
+ MVT::i32));
+ SDValue Hi = DAG.getNode(
+ ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
+ MVT::i32));
+
+ return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
+ {ST->getChain(), Lo, Hi, ST->getBasePtr()},
+ MemVT, ST->getMemOperand());
+ } else if (Subtarget->hasMVEIntegerOps() &&
+ ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
+ MemVT == MVT::v16i1))) {
+ return LowerPredicateStore(Op, DAG);
+ }
+
+ return SDValue();
+}
+
static bool isZeroVector(SDValue N) {
return (ISD::isBuildVectorAllZeros(N.getNode()) ||
(N->getOpcode() == ARMISD::VMOVIMM &&
@@ -9155,13 +9509,87 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
- if (!PassThru.isUndef() &&
- (PassThru.getOpcode() != ISD::BITCAST ||
- !isZeroVector(PassThru->getOperand(0))))
+ bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
+ PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
+ isZeroVector(PassThru->getOperand(0));
+ if (!PassThru.isUndef() && !PassThruIsCastZero)
Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
+static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ SDLoc dl(Op);
+ unsigned BaseOpcode = 0;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Expected VECREDUCE opcode");
+ case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
+ case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
+ case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
+ case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
+ case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
+ }
+
+ SDValue Op0 = Op->getOperand(0);
+ EVT VT = Op0.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumActiveLanes = NumElts;
+
+ assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
+ NumActiveLanes == 2) &&
+ "Only expected a power 2 vector size");
+
+ // Combine the vector with a reversed copy of itself (using BaseOpcode) until
+ // 4 items remain. Going down to 4 vector elements allows us to easily extract
+ // vector elements from the lanes.
+ while (NumActiveLanes > 4) {
+ unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
+ SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
+ Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
+ NumActiveLanes /= 2;
+ }
+
+ SDValue Res;
+ if (NumActiveLanes == 4) {
+ // The remaining 4 elements are summed sequentially
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
+ SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+ SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
+ } else {
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(1, dl, MVT::i32));
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+ }
+
+ // Result type may be wider than element type.
+ if (EltVT != Op->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
+ return Res;
+}
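A minimal sketch of the reduction strategy above in plain C++ (names invented): an XOR-stride butterfly step stands in for combining the vector with its VREV'd copy, and agrees with it on the lanes that are eventually extracted; the last four lanes are then combined as scalars.

#include <array>
#include <cstddef>

// Illustration only: reduce a power-of-2 number of lanes with a binary op.
template <typename T, std::size_t N, typename Op>
T reduce_lanes(std::array<T, N> V, Op Combine) {
  static_assert(N >= 4 && (N & (N - 1)) == 0, "power-of-2 lane count");
  for (std::size_t Active = N, Stride = 1; Active > 4;
       Active /= 2, Stride *= 2) {
    std::array<T, N> Next = V;
    for (std::size_t I = 0; I < N; ++I)
      Next[I] = Combine(V[I], V[I ^ Stride]); // stands in for Op(X, Rev(X))
    V = Next;
  }
  // Lanes 0, N/4, 2N/4 and 3N/4 now hold the partial results.
  T R0 = Combine(V[0], V[N / 4]);
  T R1 = Combine(V[2 * N / 4], V[3 * N / 4]);
  return Combine(R0, R1);
}

Called with std::multiplies<> or std::bit_and<> this mirrors the integer VECREDUCE_MUL/AND cases; the FP cases follow the same shape with a floating-point element type.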
+
+static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+ return LowerVecReduce(Op, DAG, ST);
+}
+
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
@@ -9231,12 +9659,13 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
bool isBigEndian = DAG.getDataLayout().isBigEndian();
- Results.push_back(
+ SDValue Lo =
DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
- SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
- Results.push_back(
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
+ SDValue Hi =
DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
- SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 2));
}
@@ -9362,9 +9791,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::LOAD:
return LowerPredicateLoad(Op, DAG);
case ISD::STORE:
- return LowerPredicateStore(Op, DAG);
+ return LowerSTORE(Op, DAG, Subtarget);
case ISD::MLOAD:
return LowerMLOAD(Op, DAG);
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ return LowerVecReduce(Op, DAG, Subtarget);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ return LowerVecReduceF(Op, DAG, Subtarget);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
@@ -9411,8 +9850,8 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
DAG.getVTList(MVT::i32, MVT::i32),
N->getOperand(1), N->getOperand(2),
Lo, Hi);
- Results.push_back(LongMul.getValue(0));
- Results.push_back(LongMul.getValue(1));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
+ LongMul.getValue(0), LongMul.getValue(1)));
}
/// ReplaceNodeResults - Replace the results of node with an illegal result
@@ -9466,7 +9905,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ABS:
lowerABS(N, Results, DAG);
return ;
-
+ case ISD::LOAD:
+ LowerLOAD(N, Results, DAG);
+ break;
}
if (Res.getNode())
Results.push_back(Res);
@@ -9499,7 +9940,7 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
ARMConstantPoolValue *CPV =
ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
- unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+ unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
: &ARM::GPRRegClass;
@@ -9507,11 +9948,11 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// Grab constant pool and fixed stack memory operands.
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
MachineMemOperand *FIMMOSt =
MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
- MachineMemOperand::MOStore, 4, 4);
+ MachineMemOperand::MOStore, 4, Align(4));
// Load the address of the dispatch MBB into the jump buffer.
if (isThumb2) {
@@ -9697,7 +10138,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI),
- MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
MachineInstrBuilder MIB;
MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
@@ -9788,10 +10229,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
- if (Align == 0)
- Align = MF->getDataLayout().getTypeAllocSize(C->getType());
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
@@ -9828,8 +10267,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(NewVReg3)
.add(predOps(ARMCC::AL));
- MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
- MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *JTMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
+ MachineMemOperand::MOLoad, 4, Align(4));
Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
@@ -9889,10 +10329,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
- if (Align == 0)
- Align = MF->getDataLayout().getTypeAllocSize(C->getType());
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
@@ -9922,8 +10360,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
- MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
- MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *JTMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
+ MachineMemOperand::MOLoad, 4, Align(4));
Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
.addReg(NewVReg3, RegState::Kill)
@@ -10162,7 +10601,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
Register dest = MI.getOperand(0).getReg();
Register src = MI.getOperand(1).getReg();
unsigned SizeVal = MI.getOperand(2).getImm();
- unsigned Align = MI.getOperand(3).getImm();
+ unsigned Alignment = MI.getOperand(3).getImm();
DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
@@ -10175,17 +10614,17 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
bool IsThumb2 = Subtarget->isThumb2();
bool IsThumb = Subtarget->isThumb();
- if (Align & 1) {
+ if (Alignment & 1) {
UnitSize = 1;
- } else if (Align & 2) {
+ } else if (Alignment & 2) {
UnitSize = 2;
} else {
// Check whether we can use NEON instructions.
if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
Subtarget->hasNEON()) {
- if ((Align % 16 == 0) && SizeVal >= 16)
+ if ((Alignment % 16 == 0) && SizeVal >= 16)
UnitSize = 16;
- else if ((Align % 8 == 0) && SizeVal >= 8)
+ else if ((Alignment % 8 == 0) && SizeVal >= 8)
UnitSize = 8;
}
// Can't use NEON instructions.
@@ -10291,13 +10730,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
- if (Align == 0)
- Align = MF->getDataLayout().getTypeAllocSize(C->getType());
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
if (IsThumb)
BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
@@ -11667,6 +12104,42 @@ static SDValue PerformAddeSubeCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformVSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
+ //
+ // We need to re-implement this optimization here as the implementation in the
+ // Target-Independent DAGCombiner does not handle the kind of constant we make
+ // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
+ // good reason, allowing truncation there would break other targets).
+ //
+ // Currently, this is only done for MVE, as it's the only target that benefits
+ // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::XOR)
+ return SDValue();
+ SDValue XOR = N->getOperand(0);
+
+ // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
+ // It is important to check with truncation allowed as the BUILD_VECTORs we
+ // generate in those situations will truncate their operands.
+ ConstantSDNode *Const =
+ isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true);
+ if (!Const || !Const->isOne())
+ return SDValue();
+
+ // Rewrite into vselect(cond, rhs, lhs).
+ SDValue Cond = XOR->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT Type = N->getValueType(0);
+ return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
+}
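The identity relied on here, sketched per lane in plain C++ (invented name): selecting with a negated mask equals selecting with the original mask and the operands swapped, so the VPNOT feeding the VPSEL can be dropped.

#include <array>
#include <cstddef>

// Illustration only: select(!Cond, LHS, RHS) == select(Cond, RHS, LHS).
template <typename T, std::size_t N>
std::array<T, N> select_lanes(const std::array<bool, N> &Cond,
                              const std::array<T, N> &A,
                              const std::array<T, N> &B) {
  std::array<T, N> R{};
  for (std::size_t I = 0; I < N; ++I)
    R[I] = Cond[I] ? A[I] : B[I];
  return R;
}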
+
static SDValue PerformABSCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -11724,6 +12197,71 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue PerformADDVecReduce(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
+ // will look like:
+ // t1: i32,i32 = ARMISD::VADDLVs x
+ // t2: i64 = build_pair t1, t1:1
+ // t3: i64 = add t2, y
+ // We also need to check for sext / zext and commutative adds.
+ auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
+ SDValue NB) {
+ if (NB->getOpcode() != ISD::BUILD_PAIR)
+ return SDValue();
+ SDValue VecRed = NB->getOperand(0);
+ if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
+ NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
+ return SDValue();
+
+ SDLoc dl(N);
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DCI.DAG.getConstant(0, dl, MVT::i32)));
+ Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DCI.DAG.getConstant(1, dl, MVT::i32)));
+ for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
+ Ops.push_back(VecRed->getOperand(i));
+ SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
+ DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
+ return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
+ SDValue(Red.getNode(), 1));
+ };
+
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
+ return M;
+ return SDValue();
+}
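As a hedged scalar picture of the fold (function names invented): the accumulating forms simply add the existing i64 value into the long reduction, so add(y, VADDLV(x)) and VADDLVA(y, x) compute the same thing.

#include <cstdint>

// Illustration only: a VADDLV.s32-style long reduction and its accumulating form.
int64_t vaddlv_s32(const int32_t X[4]) {
  int64_t Sum = 0;
  for (int I = 0; I < 4; ++I)
    Sum += X[I];
  return Sum;
}

int64_t vaddlva_s32(int64_t Acc, const int32_t X[4]) {
  return Acc + vaddlv_s32(X); // the i64 add folded into the reduction
}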
+
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
@@ -11895,6 +12433,9 @@ static SDValue PerformADDCombine(SDNode *N,
if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
+ if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget))
+ return Result;
+
// First try with the default operand order.
if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
@@ -11986,18 +12527,86 @@ static SDValue PerformVMULCombine(SDNode *N,
DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}
+static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2i64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ auto IsSignExt = [&](SDValue Op) {
+ if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
+ return SDValue();
+ EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
+ if (VT.getScalarSizeInBits() == 32)
+ return Op->getOperand(0);
+ return SDValue();
+ };
+ auto IsZeroExt = [&](SDValue Op) {
+ // Zero extends are a little more awkward. At the point we are matching
+ // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
+ // That might be before or after a bitcast depending on how the and is
+ // placed. Because this has to look through bitcasts, it is currently only
+ // supported on LE.
+ if (!Subtarget->isLittle())
+ return SDValue();
+
+ SDValue And = Op;
+ if (And->getOpcode() == ISD::BITCAST)
+ And = And->getOperand(0);
+ if (And->getOpcode() != ISD::AND)
+ return SDValue();
+ SDValue Mask = And->getOperand(1);
+ if (Mask->getOpcode() == ISD::BITCAST)
+ Mask = Mask->getOperand(0);
+
+ if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
+ Mask.getValueType() != MVT::v4i32)
+ return SDValue();
+ if (isAllOnesConstant(Mask->getOperand(0)) &&
+ isNullConstant(Mask->getOperand(1)) &&
+ isAllOnesConstant(Mask->getOperand(2)) &&
+ isNullConstant(Mask->getOperand(3)))
+ return And->getOperand(0);
+ return SDValue();
+ };
+
+ SDLoc dl(N);
+ if (SDValue Op0 = IsSignExt(N0)) {
+ if (SDValue Op1 = IsSignExt(N1)) {
+ SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+ SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+ return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
+ }
+ }
+ if (SDValue Op0 = IsZeroExt(N0)) {
+ if (SDValue Op1 = IsZeroExt(N1)) {
+ SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+ SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+ return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
+ }
+ }
+
+ return SDValue();
+}
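A hedged sketch of the semantics behind this combine (plain C++, invented names): a v2i64 multiply whose operands are really sign- or zero-extended 32-bit values is just a per-lane widening 32x32->64 multiply, which is what VMULLs/VMULLu provide.

#include <cstdint>

// Illustration only: the per-lane products computed by the rewritten nodes.
int64_t vmulls_lane(int32_t A, int32_t B) {
  return static_cast<int64_t>(A) * static_cast<int64_t>(B);
}
uint64_t vmullu_lane(uint32_t A, uint32_t B) {
  return static_cast<uint64_t>(A) * static_cast<uint64_t>(B);
}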
+
static SDValue PerformMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
+ return PerformMVEVMULLCombine(N, DAG, Subtarget);
+
if (Subtarget->isThumb1Only())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- EVT VT = N->getValueType(0);
if (VT.is64BitVector() || VT.is128BitVector())
return PerformVMULCombine(N, DCI, Subtarget);
if (VT != MVT::i32)
@@ -12182,20 +12791,21 @@ static SDValue PerformANDCombine(SDNode *N,
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
- if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 ||
+ VT == MVT::v8i1 || VT == MVT::v16i1)
return SDValue();
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (BVN && Subtarget->hasNEON() &&
+ if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- if (SplatBitSize <= 64) {
+ if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+ SplatBitSize == 64) {
EVT VbicVT;
SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VbicVT, VT.is128BitVector(),
- OtherModImm);
+ DAG, dl, VbicVT, VT, OtherModImm);
if (Val.getNode()) {
SDValue Input =
DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
@@ -12425,58 +13035,44 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) {
};
}
+static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
+ if (N->getOpcode() == ARMISD::VCMP)
+ return (ARMCC::CondCodes)N->getConstantOperandVal(2);
+ else if (N->getOpcode() == ARMISD::VCMPZ)
+ return (ARMCC::CondCodes)N->getConstantOperandVal(1);
+ else
+ llvm_unreachable("Not a VCMP/VCMPZ!");
+}
+
+static bool CanInvertMVEVCMP(SDValue N) {
+ ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
+ return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
+}
+
static SDValue PerformORCombine_i1(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
// together with predicates
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- ARMCC::CondCodes CondCode0 = ARMCC::AL;
- ARMCC::CondCodes CondCode1 = ARMCC::AL;
- if (N0->getOpcode() == ARMISD::VCMP)
- CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
- ->getZExtValue();
- else if (N0->getOpcode() == ARMISD::VCMPZ)
- CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
- ->getZExtValue();
- if (N1->getOpcode() == ARMISD::VCMP)
- CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
- ->getZExtValue();
- else if (N1->getOpcode() == ARMISD::VCMPZ)
- CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
- ->getZExtValue();
-
- if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
- return SDValue();
-
- unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
- unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
+ auto IsFreelyInvertable = [&](SDValue V) {
+ if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
+ return CanInvertMVEVCMP(V);
+ return false;
+ };
- if (!isValidMVECond(Opposite0,
- N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
- !isValidMVECond(Opposite1,
- N1->getOperand(0)->getValueType(0).isFloatingPoint()))
+ // At least one operand must be freely invertible.
+ if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
return SDValue();
- SmallVector<SDValue, 4> Ops0;
- Ops0.push_back(N0->getOperand(0));
- if (N0->getOpcode() == ARMISD::VCMP)
- Ops0.push_back(N0->getOperand(1));
- Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
- SmallVector<SDValue, 4> Ops1;
- Ops1.push_back(N1->getOperand(0));
- if (N1->getOpcode() == ARMISD::VCMP)
- Ops1.push_back(N1->getOperand(1));
- Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
-
- SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
- SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
- SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
- return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
- DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
+ SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
+ SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
+ SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
+ return DCI.DAG.getLogicalNOT(DL, And, VT);
}
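The rewrite is De Morgan's law, or(A, B) == not(and(not(A), not(B))); when at least one operand is a VCMP whose condition can be inverted for free, the inner NOTs fold away and only the final one remains as a VPNOT. A trivial hedged sketch on a 16-lane predicate mask (invented name):

#include <cstdint>

// Illustration only: OR of two predicate masks rewritten via De Morgan.
uint16_t predicate_or(uint16_t A, uint16_t B) {
  return static_cast<uint16_t>(~(static_cast<uint16_t>(~A) &
                                 static_cast<uint16_t>(~B)));
}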
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
@@ -12492,17 +13088,21 @@ static SDValue PerformORCombine(SDNode *N,
if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
+ if (Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
+ return PerformORCombine_i1(N, DCI, Subtarget);
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (BVN && Subtarget->hasNEON() &&
+ if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- if (SplatBitSize <= 64) {
+ if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+ SplatBitSize == 64) {
EVT VorrVT;
- SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
- SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VorrVT, VT.is128BitVector(),
- OtherModImm);
+ SDValue Val =
+ isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+ SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
if (Val.getNode()) {
SDValue Input =
DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
@@ -12563,10 +13163,6 @@ static SDValue PerformORCombine(SDNode *N,
}
}
- if (Subtarget->hasMVEIntegerOps() &&
- (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
- return PerformORCombine_i1(N, DCI, Subtarget);
-
// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
@@ -12598,6 +13194,27 @@ static SDValue PerformXORCombine(SDNode *N,
return Result;
}
+ if (Subtarget->hasMVEIntegerOps()) {
+ // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ const TargetLowering *TLI = Subtarget->getTargetLowering();
+ if (TLI->isConstTrueVal(N1.getNode()) &&
+ (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
+ if (CanInvertMVEVCMP(N0)) {
+ SDLoc DL(N0);
+ ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
+
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(N0->getOperand(0));
+ if (N0->getOpcode() == ARMISD::VCMP)
+ Ops.push_back(N0->getOperand(1));
+ Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
+ return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
+ }
+ }
+ }
+
return SDValue();
}
@@ -12796,6 +13413,78 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+
+ // VMOVhr (VMOVrh (X)) -> X
+ if (Op0->getOpcode() == ARMISD::VMOVrh)
+ return Op0->getOperand(0);
+
+ // FullFP16: half values are passed in S-registers, and we don't
+ // need any of the bitcasts and moves:
+ //
+ // t2: f32,ch = CopyFromReg t0, Register:f32 %0
+ // t5: i32 = bitcast t2
+ // t18: f16 = ARMISD::VMOVhr t5
+ if (Op0->getOpcode() == ISD::BITCAST) {
+ SDValue Copy = Op0->getOperand(0);
+ if (Copy.getValueType() == MVT::f32 &&
+ Copy->getOpcode() == ISD::CopyFromReg) {
+ SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
+ SDValue NewCopy =
+ DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
+ return NewCopy;
+ }
+ }
+
+ // fold (VMOVhr (load x)) -> (load (f16*)x)
+ if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
+ if (LN0->hasOneUse() && LN0->isUnindexed() &&
+ LN0->getMemoryVT() == MVT::i16) {
+ SDValue Load =
+ DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
+ LN0->getBasePtr(), LN0->getMemOperand());
+ DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
+ DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ // Only the bottom 16 bits of the source register are used.
+ APInt DemandedMask = APInt::getLowBitsSet(32, 16);
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue PerformVMOVrhCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (VMOVrh (load x)) -> (zextload (i16*)x)
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ SDValue Load =
+ DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
+ LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
+ DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
+ DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+ return Load;
+ }
+
+ // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
+ if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(N0->getOperand(1)))
+ return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
+ N0->getOperand(1));
+
+ return SDValue();
+}
+
/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads. If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
@@ -12946,8 +13635,29 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// If the valuetypes are the same, we can remove the cast entirely.
if (Op->getOperand(0).getValueType() == VT)
return Op->getOperand(0);
- return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
- Op->getOperand(0).getValueType(), Op->getOperand(0));
+ return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
+ }
+
+ return SDValue();
+}
+
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+ if (ST->isLittle())
+ return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+ // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+ if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+ // If the valuetypes are the same, we can remove the cast entirely.
+ if (Op->getOperand(0).getValueType() == VT)
+ return Op->getOperand(0);
+ return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
}
return SDValue();
@@ -13012,6 +13722,29 @@ static SDValue PerformInsertEltCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}
+static SDValue PerformExtractEltCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // extract (vdup x) -> x
+ if (Op0->getOpcode() == ARMISD::VDUP) {
+ SDValue X = Op0->getOperand(0);
+ if (VT == MVT::f16 && X.getValueType() == MVT::i32)
+ return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
+ if (VT == MVT::i32 && X.getValueType() == MVT::f16)
+ return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
+
+ while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
+ X = X->getOperand(0);
+ if (X.getValueType() == VT)
+ return X;
+ }
+
+ return SDValue();
+}
+
/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
@@ -13293,6 +14026,128 @@ static SDValue PerformVLDCombine(SDNode *N,
return CombineBaseUpdate(N, DCI);
}
+static SDValue PerformMVEVLDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Addr = N->getOperand(2);
+ MemSDNode *MemN = cast<MemSDNode>(N);
+ SDLoc dl(N);
+
+ // For the stores, where there are multiple intrinsics, we only actually want
+ // to post-inc the last of them.
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ if (IntNo == Intrinsic::arm_mve_vst2q &&
+ cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
+ return SDValue();
+ if (IntNo == Intrinsic::arm_mve_vst4q &&
+ cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
+ return SDValue();
+
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load/store. Otherwise, folding
+ // it would create a cycle. We can avoid searching through Addr as it's a
+ // predecessor to both.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Visited.insert(Addr.getNode());
+ Worklist.push_back(N);
+ Worklist.push_back(User);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist))
+ continue;
+
+ // Find the new opcode for the updating load/store.
+ bool isLoadOp = true;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
+ case Intrinsic::arm_mve_vld2q:
+ NewOpc = ARMISD::VLD2_UPD;
+ NumVecs = 2;
+ break;
+ case Intrinsic::arm_mve_vld4q:
+ NewOpc = ARMISD::VLD4_UPD;
+ NumVecs = 4;
+ break;
+ case Intrinsic::arm_mve_vst2q:
+ NewOpc = ARMISD::VST2_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_mve_vst4q:
+ NewOpc = ARMISD::VST4_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ break;
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoadOp) {
+ VecTy = N->getValueType(0);
+ } else {
+ VecTy = N->getOperand(3).getValueType();
+ }
+
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+ if (!CInc || CInc->getZExtValue() != NumBytes)
+ continue;
+
+ // Create the new updating load/store node.
+ // First, create an SDVTList for the new updating node's results.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = VecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+ // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(2)); // ptr
+ Ops.push_back(Inc);
+
+ for (unsigned i = 3; i < N->getNumOperands(); ++i)
+ Ops.push_back(N->getOperand(i));
+
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
+ MemN->getMemOperand());
+
+ // Update the uses.
+ SmallVector<SDValue, 5> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i)
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+ break;
+ }
+
+ return SDValue();
+}
+
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
@@ -13377,8 +14232,21 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
+ if (Subtarget->hasMVEIntegerOps()) {
+ EVT ExtractVT = VT.getVectorElementType();
+ // We need to ensure we are creating a legal type.
+ if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
+ ExtractVT = MVT::i32;
+ SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
+ N->getOperand(0), N->getOperand(1));
+ return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
+ }
// If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
// of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
@@ -13399,7 +14267,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
unsigned EltBits;
if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
EltSize = 8;
- EVT VT = N->getValueType(0);
if (EltSize > VT.getScalarSizeInBits())
return SDValue();
@@ -13412,6 +14279,18 @@ static SDValue PerformVDUPCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ if (Subtarget->hasMVEIntegerOps()) {
+ // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
+ // need to come from a GPR.
+ if (Op.getValueType() == MVT::f32)
+ return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+ DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
+ else if (Op.getValueType() == MVT::f16)
+ return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+ DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
+ }
if (!Subtarget->hasNEON())
return SDValue();
@@ -13540,7 +14419,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
return SDValue();
SDValue Trunc = St->getValue();
- if (Trunc->getOpcode() != ISD::TRUNCATE)
+ if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
return SDValue();
EVT FromVT = Trunc->getOperand(0).getValueType();
EVT ToVT = Trunc.getValueType();
@@ -13555,20 +14434,54 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
NumElements = 4;
if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
NumElements = 8;
- if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+ if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
+ NumElements = 4;
+ if (NumElements == 0 ||
+ (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
FromVT.getVectorNumElements() % NumElements != 0)
return SDValue();
+ // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
+ // use the VMOVN over splitting the store. We are looking for patterns of:
+ // !rev: 0 N 1 N+1 2 N+2 ...
+ // rev: N 0 N+1 1 N+2 2 ...
+ auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
+ unsigned NumElts = ToVT.getVectorNumElements();
+ if (NumElts != M.size())
+ return false;
+
+ unsigned Off0 = rev ? NumElts : 0;
+ unsigned Off1 = rev ? 0 : NumElts;
+
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+ return false;
+ if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
+ return false;
+ }
+
+ return true;
+ };
+
+ if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
+ if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
+ isVMOVNOriginalMask(Shuffle->getMask(), true))
+ return SDValue();
+
+ LLVMContext &C = *DAG.getContext();
SDLoc DL(St);
// Details about the old store
SDValue Ch = St->getChain();
SDValue BasePtr = St->getBasePtr();
- unsigned Alignment = St->getOriginalAlignment();
+ Align Alignment = St->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
AAMDNodes AAInfo = St->getAAInfo();
- EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
- EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+ // We split the store into slices of NumElements. fp16 trunc stores are
+ // converted with a vcvt and then stored as truncating integer stores.
+ EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
+ EVT NewToVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
@@ -13578,9 +14491,17 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
SDValue Extract =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
DAG.getConstant(i * NumElements, DL, MVT::i32));
+
+ if (ToEltVT == MVT::f16) {
+ SDValue FPTrunc =
+ DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
+ Extract, DAG.getConstant(0, DL, MVT::i32));
+ Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
+ }
+
SDValue Store = DAG.getTruncStore(
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
- NewToVT, Alignment, MMOFlags, AAInfo);
+ NewToVT, Alignment.value(), MMOFlags, AAInfo);
Stores.push_back(Store);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -13778,8 +14699,163 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
+static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ assert(N->getOpcode() == ISD::VECREDUCE_ADD);
+ EVT ResVT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDLoc dl(N);
+
+ // We are looking for something that will have illegal types if left alone,
+ // but that we can convert to a single instruction under MVE. For example
+ // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
+ // or
+ // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
+
+ // Cases:
+ // VADDV u/s 8/16/32
+ // VMLAV u/s 8/16/32
+ // VADDLV u/s 32
+ // VMLALV u/s 16/32
+
+ auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
+ if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
+ return SDValue();
+ SDValue A = N0->getOperand(0);
+ if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ return A;
+ return SDValue();
+ };
+ auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
+ SDValue &A, SDValue &B) {
+ if (ResVT != RetTy || N0->getOpcode() != ISD::MUL)
+ return false;
+ SDValue ExtA = N0->getOperand(0);
+ SDValue ExtB = N0->getOperand(1);
+ if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
+ return false;
+ A = ExtA->getOperand(0);
+ B = ExtB->getOperand(0);
+ if (A.getValueType() == B.getValueType() &&
+ llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ return true;
+ return false;
+ };
+ auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
+ SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
+ SDValue(Node.getNode(), 1));
+ };
+
+ if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
+ return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
+ if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
+ return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
+ if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
+ return Create64bitNode(ARMISD::VADDLVs, {A});
+ if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
+ return Create64bitNode(ARMISD::VADDLVu, {A});
+
+ SDValue A, B;
+ if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
+ return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
+ if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
+ return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
+ if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+ return Create64bitNode(ARMISD::VMLALVs, {A, B});
+ if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+ return Create64bitNode(ARMISD::VMLALVu, {A, B});
+ return SDValue();
+}
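A hedged scalar sketch of the patterns matched here (invented names): the VADDV forms sum extended lanes, and the VMLAV forms sum products of two extended inputs, i.e. a dot product, as in the VMLADAV.u8 example in the comment above.

#include <cstddef>
#include <cstdint>

// Illustration only: vecreduce_add(sext(A)) over v16i8.
int32_t vaddv_s8(const int8_t A[16]) {
  int32_t Sum = 0;
  for (std::size_t I = 0; I < 16; ++I)
    Sum += A[I];
  return Sum;
}

// Illustration only: vecreduce_add(mul(zext(A), zext(B))) over v16i8.
uint32_t vmlav_u8(const uint8_t A[16], const uint8_t B[16]) {
  uint32_t Sum = 0;
  for (std::size_t I = 0; I < 16; ++I)
    Sum += static_cast<uint32_t>(A[I]) * static_cast<uint32_t>(B[I]);
  return Sum;
}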
+
+static SDValue PerformVMOVNCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ unsigned IsTop = N->getConstantOperandVal(2);
+
+ // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
+ // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
+ if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
+ Op1->getOpcode() == ARMISD::VQMOVNu) &&
+ Op1->getConstantOperandVal(2) == 0)
+ return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
+ Op0, Op1->getOperand(1), N->getOperand(2));
+
+ // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
+ // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
+ // into the top or bottom lanes.
+ unsigned NumElts = N->getValueType(0).getVectorNumElements();
+ APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
+ APInt Op0DemandedElts =
+ IsTop ? Op1DemandedElts
+ : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+ if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue PerformVQMOVNCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+ unsigned IsTop = N->getConstantOperandVal(2);
+
+ unsigned NumElts = N->getValueType(0).getVectorNumElements();
+ APInt Op0DemandedElts =
+ APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
+ : APInt::getHighBitsSet(2, 1));
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+ return SDValue();
+}
+
+static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
+ // uses of the intrinsics.
+ if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+ int ShiftAmt = C->getSExtValue();
+ if (ShiftAmt == 0) {
+ SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
+ DAG.ReplaceAllUsesWith(N, Merge.getNode());
+ return SDValue();
+ }
+
+ if (ShiftAmt >= -32 && ShiftAmt < 0) {
+ unsigned NewOpcode =
+ N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
+ SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
+ DAG.getConstant(-ShiftAmt, DL, MVT::i32));
+ DAG.ReplaceAllUsesWith(N, NewShift.getNode());
+ return NewShift;
+ }
+ }
+
+ return SDValue();
+}
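A hedged sketch of the identity used above, with a single uint64_t standing in for the Lo/Hi register pair of LSLL/LSRL (invented name; the sketch handles any in-range amount, while the combine itself only rewrites constants in [-32, 0)):

#include <cstdint>

// Illustration only: a left shift by a negative constant becomes the
// opposite shift; a shift by zero just forwards the value.
uint64_t long_shift_left(uint64_t X, int Shift) {
  if (Shift == 0)
    return X;
  if (Shift < 0 && Shift > -64)
    return X >> -Shift; // X << -C  ==  X >> C
  if (Shift > 0 && Shift < 64)
    return X << Shift;
  return 0; // logical shifts of 64 or more clear a 64-bit value
}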
+
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IntNo) {
default:
@@ -13928,6 +15004,72 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vqrshiftu:
// No immediate versions of these to check for.
break;
+
+ case Intrinsic::arm_mve_vqdmlah:
+ case Intrinsic::arm_mve_vqdmlash:
+ case Intrinsic::arm_mve_vqrdmlah:
+ case Intrinsic::arm_mve_vqrdmlash:
+ case Intrinsic::arm_mve_vmla_n_predicated:
+ case Intrinsic::arm_mve_vmlas_n_predicated:
+ case Intrinsic::arm_mve_vqdmlah_predicated:
+ case Intrinsic::arm_mve_vqdmlash_predicated:
+ case Intrinsic::arm_mve_vqrdmlah_predicated:
+ case Intrinsic::arm_mve_vqrdmlash_predicated: {
+ // These intrinsics all take an i32 scalar operand which is narrowed to the
+ // size of a single lane of the vector type they return. So we don't need
+ // any bits of that operand above that point, which allows us to eliminate
+ // uxth/sxth.
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+ if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+
+ case Intrinsic::arm_mve_minv:
+ case Intrinsic::arm_mve_maxv:
+ case Intrinsic::arm_mve_minav:
+ case Intrinsic::arm_mve_maxav:
+ case Intrinsic::arm_mve_minv_predicated:
+ case Intrinsic::arm_mve_maxv_predicated:
+ case Intrinsic::arm_mve_minav_predicated:
+ case Intrinsic::arm_mve_maxav_predicated: {
+ // These intrinsics all take an i32 scalar operand which is narrowed to the
+ // size of a single lane of the vector type they take as the other input.
+ unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+
+ case Intrinsic::arm_mve_addv: {
+ // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
+ // which allows PerformADDVecReduce to turn it into VADDLV when possible.
+ bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
+ return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
+ }
+
+ case Intrinsic::arm_mve_addlv:
+ case Intrinsic::arm_mve_addlv_predicated: {
+ // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
+ // which recombines the two outputs into an i64
+ bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
+ (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
+ (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
+
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
+ if (i != 2) // skip the unsigned flag
+ Ops.push_back(N->getOperand(i));
+
+ SDLoc dl(N);
+ SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
+ val.getValue(1));
+ }
}
return SDValue();
@@ -14023,9 +15165,10 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
-// Look for a sign/zero extend of a larger than legal load. This can be split
-// into two extending loads, which are simpler to deal with than an arbitrary
-// sign extend.
+// Look for a sign/zero/fp extend of a larger than legal load. This can be
+// split into multiple extending loads, which are simpler to deal with than an
+// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
+// to convert the type to an f32.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::LOAD)
@@ -14047,45 +15190,63 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
NumElements = 4;
if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
NumElements = 8;
+ if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
+ NumElements = 4;
if (NumElements == 0 ||
- FromVT.getVectorNumElements() == NumElements ||
+ (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
FromVT.getVectorNumElements() % NumElements != 0 ||
!isPowerOf2_32(NumElements))
return SDValue();
+ LLVMContext &C = *DAG.getContext();
SDLoc DL(LD);
// Details about the old load
SDValue Ch = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
- unsigned Alignment = LD->getOriginalAlignment();
+ Align Alignment = LD->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
ISD::LoadExtType NewExtType =
N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
- EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
- EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
- unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
- SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
-
- // Split the load in half, each side of which is extended separately. This
- // is good enough, as legalisation will take it from there. They are either
- // already legal or they will be split further into something that is
- // legal.
- SDValue NewLoad1 =
- DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
- LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
- SDValue NewLoad2 =
- DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
- LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
- Alignment, MMOFlags, AAInfo);
-
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(NewLoad1.getNode(), 1),
- SDValue(NewLoad2.getNode(), 1));
+ EVT NewFromVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+ EVT NewToVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+ for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+ unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ SDValue NewLoad =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+ LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+ Alignment.value(), MMOFlags, AAInfo);
+ Loads.push_back(NewLoad);
+ Chains.push_back(SDValue(NewLoad.getNode(), 1));
+ }
+
+  // Float truncs need to be extended with VCVTB's into their floating point types.
+ if (FromEltVT == MVT::f16) {
+ SmallVector<SDValue, 4> Extends;
+
+ for (unsigned i = 0; i < Loads.size(); i++) {
+ SDValue LoadBC =
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
+ SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
+ DAG.getConstant(0, DL, MVT::i32));
+ Extends.push_back(FPExt);
+ }
+
+ Loads = Extends;
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
}
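// Illustrative sketch of the per-piece offset arithmetic above (standalone C++,
// assuming a v16i8 source split into v8i8 pieces): each piece advances by
// NumElements lanes of the narrow "from" type.
#include <cstdio>

int main() {
  unsigned FromEltBits = 8, NumElements = 8, TotalElements = 16;
  for (unsigned I = 0; I < TotalElements / NumElements; ++I) {
    unsigned NewOffset = (I * NumElements * FromEltBits) / 8; // bytes
    std::printf("piece %u loads at byte offset %u\n", I, NewOffset);
  }
  return 0;
}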
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
@@ -14133,6 +15294,116 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (ST->hasMVEFloatOps())
+ if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+ return NewLoad;
+
+ return SDValue();
+}
+
+/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
+/// saturates.
+static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ if (VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+
+ auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
+ // Check one is a smin and the other is a smax
+ if (Min->getOpcode() != ISD::SMIN)
+ std::swap(Min, Max);
+ if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
+ return false;
+
+ APInt SaturateC;
+ if (VT == MVT::v4i32)
+ SaturateC = APInt(32, (1 << 15) - 1, true);
+ else //if (VT == MVT::v8i16)
+ SaturateC = APInt(16, (1 << 7) - 1, true);
+
+ APInt MinC, MaxC;
+ if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+ MinC != SaturateC)
+ return false;
+ if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
+ MaxC != ~SaturateC)
+ return false;
+ return true;
+ };
+
+ if (IsSignedSaturate(N, N0.getNode())) {
+ SDLoc DL(N);
+ MVT ExtVT, HalfVT;
+ if (VT == MVT::v4i32) {
+ HalfVT = MVT::v8i16;
+ ExtVT = MVT::v4i16;
+ } else { // if (VT == MVT::v8i16)
+ HalfVT = MVT::v16i8;
+ ExtVT = MVT::v8i8;
+ }
+
+    // Create a VQMOVNB with undef top lanes, then sign extend it into the top
+    // half. That extend will hopefully be removed if only the bottom bits are
+    // demanded (through a truncating store, for example).
+ SDValue VQMOVN =
+ DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
+ N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
+ SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
+ DAG.getValueType(ExtVT));
+ }
+
+ auto IsUnsignedSaturate = [&](SDNode *Min) {
+ // For unsigned, we just need to check for <= 0xffff
+ if (Min->getOpcode() != ISD::UMIN)
+ return false;
+
+ APInt SaturateC;
+ if (VT == MVT::v4i32)
+ SaturateC = APInt(32, (1 << 16) - 1, true);
+ else //if (VT == MVT::v8i16)
+ SaturateC = APInt(16, (1 << 8) - 1, true);
+
+ APInt MinC;
+ if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+ MinC != SaturateC)
+ return false;
+ return true;
+ };
+
+ if (IsUnsignedSaturate(N)) {
+ SDLoc DL(N);
+ MVT HalfVT;
+ unsigned ExtConst;
+ if (VT == MVT::v4i32) {
+ HalfVT = MVT::v8i16;
+ ExtConst = 0x0000FFFF;
+ } else { //if (VT == MVT::v8i16)
+ HalfVT = MVT::v16i8;
+ ExtConst = 0x00FF;
+ }
+
+ // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
+ // an AND. That extend will hopefully be removed if only the bottom bits are
+    // demanded (through a truncating store, for example).
+ SDValue VQMOVN =
+ DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
+ DAG.getConstant(0, DL, MVT::i32));
+ SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+ return DAG.getNode(ISD::AND, DL, VT, Bitcast,
+ DAG.getConstant(ExtConst, DL, VT));
+ }
+
+ return SDValue();
+}
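// Illustrative sketch (standalone C++) of the patterns matched above: smin/smax
// with these splat constants is an ordinary signed saturate of an i32 lane into
// the i16 range, and umin with 0xffff is the unsigned equivalent; a VQMOVN lane
// computes exactly this.
#include <algorithm>
#include <cstdint>

static int16_t saturateS32ToS16(int32_t V) {
  return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(V, -32768), 32767));
}

static uint16_t saturateU32ToU16(uint32_t V) {
  return static_cast<uint16_t>(std::min<uint32_t>(V, 0xFFFF));
}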
+
static const APInt *isPowerOf2Constant(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
if (!C)
@@ -14614,10 +15885,41 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
return Res;
}
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue Src = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+
+ // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
+ if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
+ return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
+ }
+
+ // We may have a bitcast of something that has already had this bitcast
+ // combine performed on it, so skip past any VECTOR_REG_CASTs.
+ while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
+ Src = Src.getOperand(0);
+
+ // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
+ // would be generated is at least the width of the element type.
+ EVT SrcVT = Src.getValueType();
+ if ((Src.getOpcode() == ARMISD::VMOVIMM ||
+ Src.getOpcode() == ARMISD::VMVNIMM ||
+ Src.getOpcode() == ARMISD::VMOVFPIMM) &&
+ SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
+ DAG.getDataLayout().isBigEndian())
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
+
+ return SDValue();
+}
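// Illustrative sketch (standalone C++) of the VDUP rewrite above: reinterpreting
// the bits of a splat is the same as splatting the reinterpreted scalar, as long
// as the lane widths match. A 4-element array stands in for v4i32/v4f32.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t Scalar = 0x3f800000;   // bit pattern of 1.0f
  std::array<uint32_t, 4> DupI32;
  DupI32.fill(Scalar);            // v4i32 vdup (i32)

  std::array<float, 4> AsF32;     // v4f32 bitcast (v4i32 vdup (i32))
  std::memcpy(AsF32.data(), DupI32.data(), sizeof(AsF32));

  float FScalar;
  std::memcpy(&FScalar, &Scalar, sizeof(FScalar));
  for (float F : AsF32)
    assert(F == FScalar);         // == v4f32 vdup of the same scalar bits
  return 0;
}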
+
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
@@ -14635,25 +15937,37 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
+ case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
+ case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI);
case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+ case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
- case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+ case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
case ISD::FDIV:
return PerformVDIVCombine(N, DCI.DAG, Subtarget);
- case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return PerformIntrinsicCombine(N, DCI);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
return PerformShiftCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
- case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+ case ISD::ANY_EXTEND:
+ return PerformExtendCombine(N, DCI.DAG, Subtarget);
+ case ISD::FP_EXTEND:
+ return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
+ case ISD::SMIN:
+ case ISD::UMIN:
+ case ISD::SMAX:
+ case ISD::UMAX:
+ return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
@@ -14664,10 +15978,25 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
+ case ISD::BITCAST:
+ return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
+ case ARMISD::VECTOR_REG_CAST:
+ return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
case ARMISD::VCMP:
return PerformVCMPCombine(N, DCI, Subtarget);
+ case ISD::VECREDUCE_ADD:
+ return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::VMOVN:
+ return PerformVMOVNCombine(N, DCI);
+ case ARMISD::VQMOVNs:
+ case ARMISD::VQMOVNu:
+ return PerformVQMOVNCombine(N, DCI);
+ case ARMISD::ASRL:
+ case ARMISD::LSRL:
+ case ARMISD::LSLL:
+ return PerformLongShiftCombine(N, DCI.DAG);
case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
@@ -14756,6 +16085,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane:
return PerformVLDCombine(N, DCI);
+ case Intrinsic::arm_mve_vld2q:
+ case Intrinsic::arm_mve_vld4q:
+ case Intrinsic::arm_mve_vst2q:
+ case Intrinsic::arm_mve_vst4q:
+ return PerformMVEVLDCombine(N, DCI);
default: break;
}
break;
@@ -14839,28 +16173,21 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
return false;
}
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
- unsigned AlignCheck) {
- return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
- (DstAlign == 0 || DstAlign % AlignCheck == 0));
-}
EVT ARMTargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
// See if we can use NEON instructions for this...
- if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+ if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
bool Fast;
- if (Size >= 16 &&
- (memOpAlign(SrcAlign, DstAlign, 16) ||
+ if (Op.size() >= 16 &&
+ (Op.isAligned(Align(16)) ||
(allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
MachineMemOperand::MONone, &Fast) &&
Fast))) {
return MVT::v2f64;
- } else if (Size >= 8 &&
- (memOpAlign(SrcAlign, DstAlign, 8) ||
+ } else if (Op.size() >= 8 &&
+ (Op.isAligned(Align(8)) ||
(allowsMisalignedMemoryAccesses(
MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
Fast))) {
@@ -14974,45 +16301,97 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
if (!Subtarget->hasMVEIntegerOps())
return false;
- auto IsSinker = [](Instruction *I, int Operand) {
+ auto IsFMSMul = [&](Instruction *I) {
+ if (!I->hasOneUse())
+ return false;
+ auto *Sub = cast<Instruction>(*I->users().begin());
+ return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
+ };
+ auto IsFMS = [&](Instruction *I) {
+ if (match(I->getOperand(0), m_FNeg(m_Value())) ||
+ match(I->getOperand(1), m_FNeg(m_Value())))
+ return true;
+ return false;
+ };
+
+ auto IsSinker = [&](Instruction *I, int Operand) {
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
+ case Instruction::FAdd:
case Instruction::ICmp:
+ case Instruction::FCmp:
return true;
+ case Instruction::FMul:
+ return !IsFMSMul(I);
case Instruction::Sub:
+ case Instruction::FSub:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
return Operand == 1;
+ case Instruction::Call:
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::fma:
+ return !IsFMS(I);
+ default:
+ return false;
+ }
+ }
+ return false;
default:
return false;
}
};
- int Op = 0;
- if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
- Op = 1;
- if (!IsSinker(I, Op))
- return false;
- if (!match(I->getOperand(Op),
- m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
- m_Undef(), m_Zero()))) {
- return false;
- }
- Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
- // All uses of the shuffle should be sunk to avoid duplicating it across gpr
- // and vector registers
- for (Use &U : Shuffle->uses()) {
- Instruction *Insn = cast<Instruction>(U.getUser());
- if (!IsSinker(Insn, U.getOperandNo()))
- return false;
+ for (auto OpIdx : enumerate(I->operands())) {
+ Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
+ // Make sure we are not already sinking this operand
+ if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+ continue;
+
+ Instruction *Shuffle = Op;
+ if (Shuffle->getOpcode() == Instruction::BitCast)
+ Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
+ // We are looking for a splat that can be sunk.
+ if (!Shuffle ||
+ !match(Shuffle, m_Shuffle(
+ m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
+ continue;
+ if (!IsSinker(I, OpIdx.index()))
+ continue;
+
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Op->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ if (Shuffle != Op)
+ Ops.push_back(&Op->getOperandUse(0));
+ Ops.push_back(&OpIdx.value());
}
- Ops.push_back(&Shuffle->getOperandUse(0));
- Ops.push_back(&I->getOperandUse(Op));
return true;
}
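// Illustrative source-level example (an assumption about how such IR commonly
// arises, not taken from the patch): a loop-invariant scalar used by every
// vector lane becomes the insertelement+shufflevector splat matched above, and
// sinking that splat next to its uses lets ISel pick the scalar-operand MVE
// forms instead of keeping a VDUP'd register live across the loop.
void scaleByScalar(int *Dst, const int *Src, int Scale, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = Src[I] * Scale; // vectorised: <4 x i32> load multiplied by splat(Scale)
}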
+Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
+ if (!Subtarget->hasMVEIntegerOps())
+ return nullptr;
+ Type *SVIType = SVI->getType();
+ Type *ScalarType = SVIType->getScalarType();
+
+ if (ScalarType->isFloatTy())
+ return Type::getInt32Ty(SVIType->getContext());
+ if (ScalarType->isHalfTy())
+ return Type::getInt16Ty(SVIType->getContext());
+ return nullptr;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -15024,6 +16403,9 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return false;
}
+ if (Subtarget->hasMVEIntegerOps())
+ return true;
+
// Don't create a loadext if we can fold the extension into a wide/long
// instruction.
// If there's more than one user instruction, the loadext is desirable no
@@ -15445,7 +16827,7 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
return false;
}
-static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
+static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
bool isSEXTLoad, bool IsMasked, bool isLE,
SDValue &Base, SDValue &Offset,
bool &isInc, SelectionDAG &DAG) {
@@ -15480,16 +16862,16 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
// (in BE/masked) type.
Base = Ptr->getOperand(0);
if (VT == MVT::v4i16) {
- if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
+ if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
return true;
} else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
if (IsInRange(RHSC, 0x80, 1))
return true;
- } else if (Align >= 4 &&
+ } else if (Alignment >= 4 &&
(CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
IsInRange(RHSC, 0x80, 4))
return true;
- else if (Align >= 2 &&
+ else if (Alignment >= 2 &&
(CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
IsInRange(RHSC, 0x80, 2))
return true;
@@ -15511,28 +16893,28 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
EVT VT;
SDValue Ptr;
- unsigned Align;
+ Align Alignment;
bool isSEXTLoad = false;
bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
IsMasked = true;
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
IsMasked = true;
} else
return false;
@@ -15541,9 +16923,9 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
- IsMasked, Subtarget->isLittle(), Base,
- Offset, isInc, DAG);
+ getMVEIndexedAddressParts(
+ Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
+ Subtarget->isLittle(), Base, Offset, isInc, DAG);
else {
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
@@ -15569,31 +16951,31 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
- unsigned Align;
+ Align Alignment;
bool isSEXTLoad = false, isNonExt;
bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
isNonExt = !ST->isTruncatingStore();
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
IsMasked = true;
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
isNonExt = !ST->isTruncatingStore();
IsMasked = true;
} else
@@ -15619,7 +17001,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
+ getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
Subtarget->isLittle(), Base, Offset,
isInc, DAG);
else {
@@ -15734,18 +17116,23 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
if (Op.getOpcode() == ARMISD::VGETLANEs)
Known = Known.sext(DstSz);
else {
- Known = Known.zext(DstSz, true /* extended bits are known zero */);
+ Known = Known.zext(DstSz);
}
assert(DstSz == Known.getBitWidth());
break;
}
+ case ARMISD::VMOVrh: {
+ KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ assert(KnownOp.getBitWidth() == 16);
+ Known = KnownOp.zext(32);
+ break;
+ }
}
}
-bool
-ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &DemandedAPInt,
- TargetLoweringOpt &TLO) const {
+bool ARMTargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
// Delay optimization, so we don't have to deal with illegal types, or block
// optimizations.
if (!TLO.LegalOps)
@@ -15770,7 +17157,7 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
unsigned Mask = C->getZExtValue();
- unsigned Demanded = DemandedAPInt.getZExtValue();
+ unsigned Demanded = DemandedBits.getZExtValue();
unsigned ShrunkMask = Mask & Demanded;
unsigned ExpandedMask = Mask | ~Demanded;
@@ -15825,6 +17212,35 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
return false;
}
+bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+
+ switch (Opc) {
+ case ARMISD::ASRL:
+ case ARMISD::LSRL: {
+    // If this is result 0 and the other result is unused, see if the demanded
+ // bits allow us to shrink this long shift into a standard small shift in
+ // the opposite direction.
+ if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
+ isa<ConstantSDNode>(Op->getOperand(2))) {
+ unsigned ShAmt = Op->getConstantOperandVal(2);
+ if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
+ APInt::getAllOnesValue(32) << (32 - ShAmt)))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(
+ ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
+ TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
+ }
+ break;
+ }
+ }
+
+ return TargetLowering::SimplifyDemandedBitsForTargetNode(
+ Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
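// Illustrative check (standalone C++) of the rewrite above, assuming operand 1
// holds the high input word and result 0 the low output word: when only the top
// ShAmt bits of the low half are demanded, they come entirely from the high
// word shifted left by 32 - ShAmt.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Lo = 0xDEADBEEF, Hi = 0x12345678;
  for (unsigned ShAmt = 1; ShAmt < 32; ++ShAmt) {
    uint64_t Wide = (static_cast<uint64_t>(Hi) << 32) | Lo;
    uint32_t LowHalf = static_cast<uint32_t>(Wide >> ShAmt); // result 0 of LSRL
    uint32_t TopBitsMask = ~0u << (32 - ShAmt);              // the demanded bits
    assert((LowHalf & TopBitsMask) == ((Hi << (32 - ShAmt)) & TopBitsMask));
  }
  return 0;
}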
//===----------------------------------------------------------------------===//
// ARM Inline Assembly Support
@@ -15835,7 +17251,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
if (!Subtarget->hasV6Ops())
return false;
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
std::string AsmStr = IA->getAsmString();
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
@@ -15843,7 +17259,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
switch (AsmPieces.size()) {
default: return false;
case 1:
- AsmStr = AsmPieces[0];
+ AsmStr = std::string(AsmPieces[0]);
AsmPieces.clear();
SplitString(AsmStr, AsmPieces, " \t,");
@@ -16342,13 +17758,15 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
if (Align)
- SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
+ SP =
+ DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
SDValue Ops[2] = { SP, Chain };
return DAG.getMergeValues(Ops, DL);
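// Illustrative sketch (standalone C++) of the SP & -Align step above: AND-ing
// with the negated power-of-two alignment rounds the stack pointer down to that
// alignment.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t SP = 0x20000FEC;
  uint32_t Alignment = 64;                    // power of two
  uint32_t AlignedSP = SP & ~(Alignment - 1); // equivalent to SP & -Align
  assert(AlignedSP % Alignment == 0 && AlignedSP <= SP);
  return 0;
}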
@@ -16552,7 +17970,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
+ Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -16593,7 +18011,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
+ Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -16619,6 +18037,34 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_mve_vld2q:
+ case Intrinsic::arm_mve_vld4q: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
+ unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
+ Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(VecTy->getScalarSizeInBits() / 8);
+ // volatile loads with MVE intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
+ case Intrinsic::arm_mve_vst2q:
+ case Intrinsic::arm_mve_vst4q: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ Type *VecTy = I.getArgOperand(1)->getType();
+ unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
+ Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(VecTy->getScalarSizeInBits() / 8);
+ // volatile stores with MVE intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -16627,7 +18073,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -16639,7 +18085,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -16873,7 +18319,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
- unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+ unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
// We can do a store + vector extract on any vector that fits perfectly in a D
// or Q register.
if (BitWidth == 64 || BitWidth == 128) {
@@ -16986,7 +18432,7 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
bool ARMTargetLowering::isLegalInterleavedAccessType(
- unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {
+ unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
@@ -17045,8 +18491,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
- VectorType *VecTy = Shuffles[0]->getType();
- Type *EltTy = VecTy->getVectorElementType();
+ auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
+ Type *EltTy = VecTy->getElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
@@ -17061,8 +18507,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
- VecTy =
- VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
IRBuilder<> Builder(LI);
@@ -17072,15 +18517,15 @@ bool ARMTargetLowering::lowerInterleavedLoad(
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
- VecTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / NumLoads);
+ VecTy = FixedVectorType::get(VecTy->getElementType(),
+ VecTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace()));
+ BaseAddr,
+ VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
@@ -17105,8 +18550,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
"expected interleave factor of 2 or 4 for MVE");
Intrinsic::ID LoadInts =
Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
- Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace());
+ Type *VecEltTy =
+ VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[] = {VecTy, VecEltTy};
Function *VldnFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
@@ -17126,9 +18571,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr =
- Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
- VecTy->getVectorNumElements() * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
+ VecTy->getNumElements() * Factor);
CallInst *VldN = createLoadIntrinsic(BaseAddr);
@@ -17143,8 +18587,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
- SubVec, VectorType::get(SV->getType()->getVectorElementType(),
- VecTy->getVectorNumElements()));
+ SubVec,
+ FixedVectorType::get(SV->getType()->getElementType(), VecTy));
SubVecs[SV].push_back(SubVec);
}
@@ -17196,13 +18640,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- VectorType *VecTy = SVI->getType();
- assert(VecTy->getVectorNumElements() % Factor == 0 &&
- "Invalid interleaved store");
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
- unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
- Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
@@ -17224,12 +18667,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
Type *IntTy = DL.getIntPtrType(EltTy);
// Convert to the corresponding integer vector.
- Type *IntVecTy =
- VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+ auto *IntVecTy =
+ FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, LaneLen);
+ SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
// The base address of the store.
@@ -17239,14 +18682,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
- SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
- SI->getPointerAddressSpace()));
+ BaseAddr,
+ SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
}
assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
@@ -17276,7 +18719,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
"expected interleave factor of 2 or 4 for MVE");
Intrinsic::ID StoreInts =
Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
- Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+ Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
SI->getPointerAddressSpace());
Type *Tys[] = {EltPtrTy, SubVecTy};
Function *VstNFunc =
@@ -17298,7 +18741,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
BaseAddr, LaneLen * Factor);
SmallVector<Value *, 4> Shuffles;
@@ -17308,7 +18751,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Shuffles.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
@@ -17325,7 +18768,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
Shuffles.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
}
}
@@ -17373,11 +18816,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
case HA_DOUBLE:
return false;
case HA_VECT64:
- return VT->getBitWidth() == 64;
+ return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
case HA_VECT128:
- return VT->getBitWidth() == 128;
+ return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
case HA_UNKNOWN:
- switch (VT->getBitWidth()) {
+ switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
case 64:
Base = HA_VECT64;
return true;
@@ -17396,7 +18839,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
DataLayout DL) const {
- const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
+ const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
if (!ArgTy->isVectorTy())
return ABITypeAlign;
@@ -17423,18 +18866,18 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
return IsHA || IsIntArray;
}
-unsigned ARMTargetLowering::getExceptionPointerRegister(
+Register ARMTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
// Platforms which do not use SjLj EH may return values in these registers
// via the personality function.
- return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
+ return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}
-unsigned ARMTargetLowering::getExceptionSelectorRegister(
+Register ARMTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Platforms which do not use SjLj EH may return values in these registers
// via the personality function.
- return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
+ return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
index 6061a65d3b89..8b1f4183032e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -68,10 +68,12 @@ class VectorType;
CALL, // Function call.
CALL_PRED, // Function call that's predicable.
CALL_NOLINK, // Function call with branch not branch-and-link.
+ tSECALL, // CMSE non-secure function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
RET_FLAG, // Return with a flag operand.
+ SERET_FLAG, // CMSE Entry function return with a flag operand.
INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand.
PIC_ADD, // Add with a PC operand and a PIC label.
@@ -133,6 +135,7 @@ class VectorType;
LE, // Low-overhead loops, Loop End
PREDICATE_CAST, // Predicate cast for MVE i1 types
+ VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
VCMP, // Vector compare.
VCMPZ, // Vector compare to zero.
@@ -201,10 +204,36 @@ class VectorType;
VTBL2, // 2-register shuffle with mask
VMOVN, // MVE vmovn
+ // MVE Saturating truncates
+ VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
+ VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
+
+ // MVE float <> half converts
+ VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes
+ VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
+
// Vector multiply long:
VMULLs, // ...signed
VMULLu, // ...unsigned
+ // MVE reductions
+ VADDVs, // sign- or zero-extend the elements of a vector to i32,
+ VADDVu, // add them all together, and return an i32 of their sum
+ VADDLVs, // sign- or zero-extend elements to i64 and sum, returning
+ VADDLVu, // the low and high 32-bit halves of the sum
+ VADDLVAs, // same as VADDLV[su] but also add an input accumulator
+ VADDLVAu, // provided as low and high halves
+ VADDLVps, // same as VADDLVs but with a v4i1 predicate mask
+ VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask
+ VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask
+ VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask
+ VMLAVs,
+ VMLAVu,
+ VMLALVs,
+ VMLALVu,
+ VMLALVAs,
+ VMLALVAu,
+
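// Illustrative model (standalone C++, not from the patch) of VADDLVs on a v4i32
// input, following the comments above: sign-extend each lane to i64, sum them,
// and return the low and high 32-bit halves.
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> vaddlvsModel(const int32_t (&Lanes)[4]) {
  int64_t Sum = 0;
  for (int32_t L : Lanes)
    Sum += L; // each lane sign-extended to i64
  uint64_t Bits = static_cast<uint64_t>(Sum);
  return {static_cast<uint32_t>(Bits), static_cast<uint32_t>(Bits >> 32)};
}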
SMULWB, // Signed multiply word by half word, bottom
SMULWT, // Signed multiply word by half word, top
UMLAL, // 64bit Unsigned Accumulate Multiply
@@ -280,7 +309,11 @@ class VectorType;
VST4_UPD,
VST2LN_UPD,
VST3LN_UPD,
- VST4LN_UPD
+ VST4LN_UPD,
+
+ // Load/Store of dual registers
+ LDRD,
+ STRD
};
} // end namespace ARMISD
@@ -333,8 +366,16 @@ class VectorType;
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+ const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
/// allowsMisalignedMemoryAccesses - Returns true if the target allows
@@ -345,10 +386,7 @@ class VectorType;
MachineMemOperand::Flags Flags,
bool *Fast) const override;
- EVT getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
@@ -356,6 +394,7 @@ class VectorType;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
+ Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const override;
bool isFNegFree(EVT VT) const override;
@@ -414,10 +453,10 @@ class VectorType;
const SelectionDAG &DAG,
unsigned Depth) const override;
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
-
bool ExpandInlineAsm(CallInst *CI) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
@@ -522,6 +561,12 @@ class VectorType;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+    // Using overflow ops for overflow checks only should be beneficial on ARM.
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
+ }
+
/// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
@@ -529,12 +574,12 @@ class VectorType;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
@@ -606,7 +651,7 @@ class VectorType;
/// Returns true if \p VecTy is a legal interleaved access type. This
/// function checks the vector element type and the overall width of the
/// vector.
- bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy,
+ bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy,
const DataLayout &DL) const;
bool alignLoopsWithOptSize() const override;
@@ -723,6 +768,8 @@ class VectorType;
SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
SmallVectorImpl<SDValue> &Results) const;
+ SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const;
SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
SDValue &Chain) const;
SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
@@ -734,6 +781,8 @@ class VectorType;
SDValue LowerFSETCC(SDValue Op, SelectionDAG &DAG) const;
void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
+ void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
@@ -744,6 +793,11 @@ class VectorType;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
+ SDValue MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT, MVT ValVT,
+ SDValue Val) const;
+ SDValue MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT,
+ MVT ValVT, SDValue Val) const;
+
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -763,6 +817,17 @@ class VectorType;
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ bool
+ splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ Optional<CallingConv::ID> CC) const override;
+
+ SDValue
+ joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT,
+ Optional<CallingConv::ID> CC) const override;
+
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -783,7 +848,7 @@ class VectorType;
SmallVectorImpl<SDValue> &InVals) const override;
/// HandleByVal - Target-specific cleanup for ByVal support.
- void HandleByVal(CCState *, unsigned &, unsigned) const override;
+ void HandleByVal(CCState *, unsigned &, Align) const override;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td
new file mode 100644
index 000000000000..0e97668e2e01
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -0,0 +1,666 @@
+//===-- ARMInstrCDE.td - CDE support for ARM ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Arm CDE (Custom Datapath Extension) instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// Immediate operand of arbitrary bit width
+class BitWidthImmOperand<int width>
+ : ImmAsmOperand<0, !add(!shl(1, width), -1)> {
+ let Name = "Imm"#width#"b";
+}
+
+class BitWidthImm<int width>
+ : Operand<i32>,
+ ImmLeaf<i32, "{ return Imm >= 0 && Imm < (1 << "#width#"); }"> {
+ let ParserMatchClass = BitWidthImmOperand<width>;
+}
+
+def CDEDualRegOp : RegisterOperand<GPRPairnosp, "printGPRPairOperand">;
+
+// Used by VCX3 FP
+def imm_3b : BitWidthImm<3>;
+
+// Used by VCX3 vector
+def imm_4b : BitWidthImm<4>;
+
+// Used by VCX2 FP and CX3
+def imm_6b : BitWidthImm<6>;
+
+// Used by VCX2 vector
+def imm_7b : BitWidthImm<7>;
+
+// Used by CX2
+def imm_9b : BitWidthImm<9>;
+
+// Used by VCX1 FP
+def imm_11b : BitWidthImm<11>;
+
+// Used by VCX1 vector
+def imm_12b : BitWidthImm<12>;
+
+// Used by CX1
+def imm_13b : BitWidthImm<13>;
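// Illustrative sketch (standalone C++): each BitWidthImm<N> above accepts the
// unsigned immediate range 0 .. (1 << N) - 1, per its ImmLeaf predicate.
#include <cstdio>

int main() {
  const int Widths[] = {3, 4, 6, 7, 9, 11, 12, 13};
  for (int Width : Widths)
    std::printf("imm_%db accepts 0..%d\n", Width, (1 << Width) - 1);
  return 0;
}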
+
+// Base class for all CDE instructions
+class CDE_Instr<bit acc, dag oops, dag iops, string asm, string cstr>
+ : Thumb2XI<oops, !con((ins p_imm:$coproc), iops),
+ AddrModeNone, /*sz=*/4, NoItinerary,
+ asm, cstr, /*pattern=*/[]>,
+ Sched<[]> {
+ bits<3> coproc;
+
+ let Inst{31-29} = 0b111; // 15:13
+ let Inst{28} = acc;
+ let Inst{27-26} = 0b11;
+ let Inst{11} = 0b0;
+ let Inst{10-8} = coproc{2-0};
+
+ let isPredicable = 0;
+ let DecoderNamespace = "Thumb2CDE";
+}
+
+// Base class for CX* CDE instructions
+class CDE_GPR_Instr<bit dual, bit acc, dag oops, dag iops,
+ string asm, string cstr>
+ : CDE_Instr<acc, oops, iops, asm, cstr>,
+ Requires<[HasCDE]> {
+
+ let Inst{25-24} = 0b10;
+ let Inst{6} = dual;
+ let isPredicable = acc;
+}
+
+// Set of registers used by the CDE instructions.
+class CDE_RegisterOperands {
+ dag Rd;
+ dag Rd_src;
+ dag Rn;
+ dag Rm;
+}
+
+// CX* CDE instruction parameter set
+class CX_Params {
+ dag Oops; // Output operands for CX* instructions
+ dag Iops1; // Input operands for CX1* instructions
+ dag Iops2; // Input operands for CX2* instructions
+ dag Iops3; // Input operands for CX3* instructions
+ dag PredOp; // Input predicate operand
+ string PAsm; // Predicate assembly string
+ string Cstr; // asm constraint string
+ bit Dual; // "dual" field for encoding
+ bit Acc; // "acc" field for encoding
+}
+
+// VCX* CDE instruction parameter set
+class VCX_Params {
+ dag Oops; // Output operands for VCX* instructions
+ dag Iops1; // Input operands for VCX1* instructions
+ dag Iops2; // Input operands for VCX2* instructions
+ dag Iops3; // Input operands for VCX3* instructions
+ string Cstr; // asm constraint string
+ bit Acc; // "acc" field for encoding
+ vpred_ops Vpred; // Predication type for VCX* vector instructions
+}
+
+// CX1, CX1A, CX1D, CX1DA
+class CDE_CX1_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops1, (ins imm_13b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $imm"),
+ params.Cstr> {
+ bits<13> imm;
+ bits<4> Rd;
+
+ let Inst{23-22} = 0b00;
+ let Inst{21-16} = imm{12-7};
+ let Inst{15-12} = Rd{3-0};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
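// Illustrative sketch (standalone C++) of how the 13-bit CX1 immediate is
// scattered into the encoding fields listed above; all other instruction fields
// are left at zero here.
#include <cassert>
#include <cstdint>

static uint32_t encodeCX1Imm(uint32_t Imm13) {
  uint32_t Inst = 0;
  Inst |= ((Imm13 >> 7) & 0x3F) << 16; // Inst{21-16} = imm{12-7}
  Inst |= ((Imm13 >> 6) & 0x1) << 7;   // Inst{7}     = imm{6}
  Inst |= Imm13 & 0x3F;                // Inst{5-0}   = imm{5-0}
  return Inst;
}

int main() {
  assert(encodeCX1Imm(0x1FFF) == 0x003F00BF);
  return 0;
}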
+
+// CX2, CX2A, CX2D, CX2DA
+class CDE_CX2_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops2, (ins imm_9b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $imm"),
+ params.Cstr> {
+ bits<9> imm;
+ bits<4> Rd;
+ bits<4> Rn;
+
+ let Inst{23-22} = 0b01;
+ let Inst{21-20} = imm{8-7};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = Rd{3-0};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// CX3, CX3A, CX3D, CX3DA
+class CDE_CX3_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops3, (ins imm_6b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $Rm, $imm"),
+ params.Cstr> {
+ bits<6> imm;
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{23} = 0b1;
+ let Inst{22-20} = imm{5-3};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = Rm{3-0};
+ let Inst{7} = imm{2};
+ let Inst{5-4} = imm{1-0};
+ let Inst{3-0} = Rd{3-0};
+}
+
+// Registers for single-register variants of CX* instructions
+def cde_cx_single_regs : CDE_RegisterOperands {
+ let Rd = (outs GPRwithAPSR_NZCVnosp:$Rd);
+ let Rd_src = (ins GPRwithAPSR_NZCVnosp:$Rd_src);
+ let Rn = (ins GPRwithAPSR_NZCVnosp:$Rn);
+ let Rm = (ins GPRwithAPSR_NZCVnosp:$Rm);
+}
+
+// Registers for dual-register variants of CX* instructions
+def cde_cx_dual_regs : CDE_RegisterOperands {
+ let Rd = (outs CDEDualRegOp:$Rd);
+ let Rd_src = (ins CDEDualRegOp:$Rd_src);
+ let Rn = (ins GPRwithAPSR_NZCVnosp:$Rn);
+ let Rm = (ins GPRwithAPSR_NZCVnosp:$Rm);
+}
+
+class CDE_CX_ParamsTemplate<bit dual, bit acc, CDE_RegisterOperands ops>
+ : CX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rn);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let PredOp = !if(acc, (ins pred:$p), (ins));
+ let PAsm = !if(acc, "${p}", "");
+ let Cstr = !if(acc, "$Rd = $Rd_src", "");
+ let Dual = dual;
+ let Acc = acc;
+}
+
+def cde_cx_params_single_noacc : CDE_CX_ParamsTemplate<0b0, 0b0, cde_cx_single_regs>;
+def cde_cx_params_single_acc : CDE_CX_ParamsTemplate<0b0, 0b1, cde_cx_single_regs>;
+def cde_cx_params_dual_noacc : CDE_CX_ParamsTemplate<0b1, 0b0, cde_cx_dual_regs>;
+def cde_cx_params_dual_acc : CDE_CX_ParamsTemplate<0b1, 0b1, cde_cx_dual_regs>;
+
+def CDE_CX1 : CDE_CX1_Instr<"cx1", cde_cx_params_single_noacc>;
+def CDE_CX1A : CDE_CX1_Instr<"cx1a", cde_cx_params_single_acc>;
+def CDE_CX1D : CDE_CX1_Instr<"cx1d", cde_cx_params_dual_noacc>;
+def CDE_CX1DA : CDE_CX1_Instr<"cx1da", cde_cx_params_dual_acc>;
+
+def CDE_CX2 : CDE_CX2_Instr<"cx2", cde_cx_params_single_noacc>;
+def CDE_CX2A : CDE_CX2_Instr<"cx2a", cde_cx_params_single_acc>;
+def CDE_CX2D : CDE_CX2_Instr<"cx2d", cde_cx_params_dual_noacc>;
+def CDE_CX2DA : CDE_CX2_Instr<"cx2da", cde_cx_params_dual_acc>;
+
+def CDE_CX3 : CDE_CX3_Instr<"cx3", cde_cx_params_single_noacc>;
+def CDE_CX3A : CDE_CX3_Instr<"cx3a", cde_cx_params_single_acc>;
+def CDE_CX3D : CDE_CX3_Instr<"cx3d", cde_cx_params_dual_noacc>;
+def CDE_CX3DA : CDE_CX3_Instr<"cx3da", cde_cx_params_dual_acc>;
+
+let Predicates = [HasCDE] in {
+ def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)),
+ (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ timm:$imm)),
+ (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ imm_13b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ timm:$imm)),
+ (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ imm_9b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n, timm:$imm)),
+ (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+ (i32 (CDE_CX3 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc,
+ GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+ (i32 (CDE_CX3A p_imm:$coproc,
+ GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+}
+
+class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>;
+class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>;
+class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>;
+
+// Base class for CDE VCX* instructions
+class CDE_FP_Vec_Instr<bit vec, bit acc, dag oops, dag iops, string asm, string cstr>
+ : CDE_Instr<acc, oops, iops, asm, cstr> {
+ let Inst{25} = 0b0;
+ let Inst{6} = vec;
+}
+
+// Base class for floating-point variants of CDE VCX* instructions
+class CDE_FP_Instr<bit acc, bit sz, dag oops, dag iops, string asm, string cstr>
+ : CDE_FP_Vec_Instr<0b0, acc, oops, iops, asm, cstr> {
+ let Inst{24} = sz;
+}
+
+// Base class for vector variants of CDE VCX* instruction
+class CDE_Vec_Instr<bit acc, dag oops, dag iops, string asm, string cstr,
+ vpred_ops vpred>
+ : CDE_FP_Vec_Instr<0b1, acc, oops,
+ !con(iops, (ins vpred:$vp)), asm,
+ !strconcat(cstr, vpred.vpred_constraint)>,
+ CDE_RequiresQReg {
+}
+
+
+// VCX1/VCX1A, vector variant
+class CDE_VCX1_Vec_Instr<string iname, VCX_Params params>
+ : CDE_Vec_Instr<params.Acc, params.Oops,
+ !con(params.Iops1, (ins imm_12b:$imm)),
+ iname#"${vp}\t$coproc, $Qd, $imm", params.Cstr, params.Vpred> {
+ bits<12> imm;
+ bits<3> Qd;
+
+ let Inst{24} = imm{11};
+ let Inst{23} = 0b0;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = imm{10-7};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+
+ let Unpredictable{22} = 0b1;
+}
+
+// VCX1/VCX1A, base class for FP variants
+class CDE_VCX1_FP_Instr<bit sz, string iname, VCX_Params params>
+ : CDE_FP_Instr<params.Acc, sz, params.Oops,
+ !con(params.Iops1, (ins imm_11b:$imm)),
+ iname#"\t$coproc, $Vd, $imm", params.Cstr> {
+ bits<11> imm;
+
+ let Inst{23} = 0b0;
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = imm{10-7};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// VCX1/VCX1A, S registers
+class CDE_VCX1_FP_Instr_S<string iname, VCX_Params params>
+ : CDE_VCX1_FP_Instr<0b0, iname, params>,
+ CDE_RequiresSReg {
+ bits<5> Vd;
+
+ let Inst{22} = Vd{0};
+ let Inst{15-12} = Vd{4-1};
+}
+
+// VCX1/VCX1A, D registers
+class CDE_VCX1_FP_Instr_D<string iname, VCX_Params params>
+ : CDE_VCX1_FP_Instr<0b1, iname, params>,
+ CDE_RequiresDReg {
+ bits<5> Vd;
+
+ let Inst{22} = Vd{4};
+ let Inst{15-12} = Vd{3-0};
+}
+
+// VCX2/VCX2A, vector variant
+class CDE_VCX2_Vec_Instr<string iname, VCX_Params params>
+ : CDE_Vec_Instr<params.Acc, params.Oops,
+ !con(params.Iops2, (ins imm_7b:$imm)),
+ iname#"${vp}\t$coproc, $Qd, $Qm, $imm", params.Cstr,
+ params.Vpred> {
+ bits<7> imm;
+ bits<3> Qd;
+ bits<3> Qm;
+
+ let Inst{24} = imm{6};
+ let Inst{23} = 0b0;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{5-2};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{7} = imm{1};
+ let Inst{5} = 0b0;
+ let Inst{4} = imm{0};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+
+ let Unpredictable{22} = 0b1;
+ let Unpredictable{5} = 0b1;
+}
+
+// VCX2/VCX2A, base class for FP variants
+class CDE_VCX2_FP_Instr<bit sz, string iname, VCX_Params params>
+ : CDE_FP_Instr<params.Acc, sz, params.Oops,
+ !con(params.Iops2, (ins imm_6b:$imm)),
+ iname#"\t$coproc, $Vd, $Vm, $imm", params.Cstr> {
+ bits<6> imm;
+
+ let Inst{23} = 0b0;
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{5-2};
+ let Inst{7} = imm{1};
+ let Inst{4} = imm{0};
+}
+
+// VCX2/VCX2A, S registers
+class CDE_VCX2_FP_Instr_S<string iname, VCX_Params params>
+ : CDE_VCX2_FP_Instr<0b0, iname, params>,
+ CDE_RequiresSReg {
+ bits<5> Vd;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{4-1};
+ let Inst{22} = Vd{0};
+ let Inst{3-0} = Vm{4-1};
+ let Inst{5} = Vm{0};
+}
+
+// VCX2/VCX2A, D registers
+class CDE_VCX2_FP_Instr_D<string iname, VCX_Params params>
+ : CDE_VCX2_FP_Instr<0b1, iname, params>,
+ CDE_RequiresDReg {
+ bits<5> Vd;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+}
+
+// VCX3/VCX3A, vector variant
+class CDE_VCX3_Vec_Instr<string iname, VCX_Params params>
+ : CDE_Vec_Instr<params.Acc, params.Oops,
+ !con(params.Iops3, (ins imm_4b:$imm)),
+ iname#"${vp}\t$coproc, $Qd, $Qn, $Qm, $imm", params.Cstr,
+ params.Vpred> {
+ bits<4> imm;
+ bits<3> Qd;
+ bits<3> Qm;
+ bits<3> Qn;
+
+ let Inst{24} = imm{3};
+ let Inst{23} = 0b1;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = imm{2-1};
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{7} = 0b0;
+ let Inst{5} = 0b0;
+ let Inst{4} = imm{0};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+
+ let Unpredictable{22} = 0b1;
+ let Unpredictable{7} = 0b1;
+ let Unpredictable{5} = 0b1;
+}
+
+// VCX3/VCX3A, base class for FP variants
+class CDE_VCX3_FP_Instr<bit sz, string iname, VCX_Params params>
+ : CDE_FP_Instr<params.Acc, sz, params.Oops,
+ !con(params.Iops3, (ins imm_3b:$imm)),
+ iname#"\t$coproc, $Vd, $Vn, $Vm, $imm", params.Cstr> {
+ bits<3> imm;
+
+ let Inst{23} = 0b1;
+ let Inst{21-20} = imm{2-1};
+ let Inst{4} = imm{0};
+}
+
+// VCX3/VCX3A, S registers
+class CDE_VCX3_FP_Instr_S<string iname, VCX_Params params>
+ : CDE_VCX3_FP_Instr<0b0, iname, params>,
+ CDE_RequiresSReg {
+ bits<5> Vd;
+ bits<5> Vm;
+ bits<5> Vn;
+
+ let Inst{22} = Vd{0};
+ let Inst{19-16} = Vn{4-1};
+ let Inst{15-12} = Vd{4-1};
+ let Inst{7} = Vn{0};
+ let Inst{5} = Vm{0};
+ let Inst{3-0} = Vm{4-1};
+}
+
+// VCX3/VCX3A, D registers
+class CDE_VCX3_FP_Instr_D<string iname, VCX_Params params>
+ : CDE_VCX3_FP_Instr<0b1, iname, params>,
+ CDE_RequiresDReg {
+ bits<5> Vd;
+ bits<5> Vm;
+ bits<5> Vn;
+
+ let Inst{22} = Vd{4};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{5} = Vm{4};
+ let Inst{3-0} = Vm{3-0};
+}
+
+// Register operands for VCX* instructions
+class CDE_VCX_RegisterOperandsTemplate<RegisterClass regclass>
+ : CDE_RegisterOperands {
+ let Rd = (outs regclass:$Vd);
+ let Rd_src = (ins regclass:$Vd_src);
+ let Rn = (ins regclass:$Vn);
+ let Rm = (ins regclass:$Vm);
+}
+
+class CDE_VCXQ_RegisterOperandsTemplate<RegisterClass regclass>
+ : CDE_RegisterOperands {
+ let Rd = (outs regclass:$Qd);
+ let Rd_src = (ins regclass:$Qd_src);
+ let Rn = (ins regclass:$Qn);
+ let Rm = (ins regclass:$Qm);
+}
+
+def cde_vcx_s_regs : CDE_VCX_RegisterOperandsTemplate<SPR>;
+def cde_vcx_d_regs : CDE_VCX_RegisterOperandsTemplate<DPR_VFP2>;
+def cde_vcx_q_regs : CDE_VCXQ_RegisterOperandsTemplate<MQPR>;
+
+class CDE_VCX_ParamsTemplate<bit acc, CDE_RegisterOperands ops>
+ : VCX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rm);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let Cstr = !if(acc, "$Vd = $Vd_src", "");
+ let Acc = acc;
+}
+
+class CDE_VCXQ_ParamsTemplate<bit acc, CDE_RegisterOperands ops>
+ : VCX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rm);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let Cstr = !if(acc, "$Qd = $Qd_src", "");
+ let Acc = acc;
+ let Vpred = !if(acc, vpred_n, vpred_r);
+}
+
+def cde_vcx_params_s_noacc : CDE_VCX_ParamsTemplate<0b0, cde_vcx_s_regs>;
+def cde_vcx_params_s_acc : CDE_VCX_ParamsTemplate<0b1, cde_vcx_s_regs>;
+def cde_vcx_params_d_noacc : CDE_VCX_ParamsTemplate<0b0, cde_vcx_d_regs>;
+def cde_vcx_params_d_acc : CDE_VCX_ParamsTemplate<0b1, cde_vcx_d_regs>;
+def cde_vcx_params_q_noacc : CDE_VCXQ_ParamsTemplate<0b0, cde_vcx_q_regs>;
+def cde_vcx_params_q_acc : CDE_VCXQ_ParamsTemplate<0b1, cde_vcx_q_regs>;
+
+def CDE_VCX1_fpsp : CDE_VCX1_FP_Instr_S<"vcx1", cde_vcx_params_s_noacc>;
+def CDE_VCX1A_fpsp : CDE_VCX1_FP_Instr_S<"vcx1a", cde_vcx_params_s_acc>;
+def CDE_VCX1_fpdp : CDE_VCX1_FP_Instr_D<"vcx1", cde_vcx_params_d_noacc>;
+def CDE_VCX1A_fpdp : CDE_VCX1_FP_Instr_D<"vcx1a", cde_vcx_params_d_acc>;
+def CDE_VCX1_vec : CDE_VCX1_Vec_Instr<"vcx1", cde_vcx_params_q_noacc>;
+def CDE_VCX1A_vec : CDE_VCX1_Vec_Instr<"vcx1a", cde_vcx_params_q_acc>;
+
+def CDE_VCX2_fpsp : CDE_VCX2_FP_Instr_S<"vcx2", cde_vcx_params_s_noacc>;
+def CDE_VCX2A_fpsp : CDE_VCX2_FP_Instr_S<"vcx2a", cde_vcx_params_s_acc>;
+def CDE_VCX2_fpdp : CDE_VCX2_FP_Instr_D<"vcx2", cde_vcx_params_d_noacc>;
+def CDE_VCX2A_fpdp : CDE_VCX2_FP_Instr_D<"vcx2a", cde_vcx_params_d_acc>;
+def CDE_VCX2_vec : CDE_VCX2_Vec_Instr<"vcx2", cde_vcx_params_q_noacc>;
+def CDE_VCX2A_vec : CDE_VCX2_Vec_Instr<"vcx2a", cde_vcx_params_q_acc>;
+
+def CDE_VCX3_fpsp : CDE_VCX3_FP_Instr_S<"vcx3", cde_vcx_params_s_noacc>;
+def CDE_VCX3A_fpsp : CDE_VCX3_FP_Instr_S<"vcx3a", cde_vcx_params_s_acc>;
+def CDE_VCX3_fpdp : CDE_VCX3_FP_Instr_D<"vcx3", cde_vcx_params_d_noacc>;
+def CDE_VCX3A_fpdp : CDE_VCX3_FP_Instr_D<"vcx3a", cde_vcx_params_d_acc>;
+def CDE_VCX3_vec : CDE_VCX3_Vec_Instr<"vcx3", cde_vcx_params_q_noacc>;
+def CDE_VCX3A_vec : CDE_VCX3_Vec_Instr<"vcx3a", cde_vcx_params_q_acc>;
+
+
+let Predicates = [HasCDE, HasFPRegs] in {
+ def : Pat<(f32 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)),
+ (f32 (CDE_VCX1_fpsp p_imm:$coproc, imm_11b:$imm))>;
+ def : Pat<(f32 (int_arm_cde_vcx1a timm:$coproc, (f32 SPR:$acc), timm:$imm)),
+ (f32 (CDE_VCX1A_fpsp p_imm:$coproc, SPR:$acc, imm_11b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)),
+ (f64 (CDE_VCX1_fpdp p_imm:$coproc, imm_11b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx1a timm:$coproc, (f64 DPR:$acc), timm:$imm)),
+ (f64 (CDE_VCX1A_fpdp p_imm:$coproc, DPR:$acc, imm_11b:$imm))>;
+
+ def : Pat<(f32 (int_arm_cde_vcx2 timm:$coproc, (f32 SPR:$n), timm:$imm)),
+ (f32 (CDE_VCX2_fpsp p_imm:$coproc, SPR:$n, imm_6b:$imm))>;
+ def : Pat<(f32 (int_arm_cde_vcx2a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n),
+ timm:$imm)),
+ (f32 (CDE_VCX2A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, imm_6b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx2 timm:$coproc, (f64 DPR:$n), timm:$imm)),
+ (f64 (CDE_VCX2_fpdp p_imm:$coproc, DPR:$n, imm_6b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx2a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n),
+ timm:$imm)),
+ (f64 (CDE_VCX2A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, imm_6b:$imm))>;
+
+ def : Pat<(f32 (int_arm_cde_vcx3 timm:$coproc, (f32 SPR:$n), (f32 SPR:$m),
+ timm:$imm)),
+ (f32 (CDE_VCX3_fpsp p_imm:$coproc, (f32 SPR:$n), (f32 SPR:$m),
+ imm_3b:$imm))>;
+ def : Pat<(f32 (int_arm_cde_vcx3a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n),
+ (f32 SPR:$m), timm:$imm)),
+ (f32 (CDE_VCX3A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, SPR:$m,
+ imm_3b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx3 timm:$coproc, (f64 DPR:$n), (f64 DPR:$m),
+ timm:$imm)),
+ (f64 (CDE_VCX3_fpdp p_imm:$coproc, DPR:$n, DPR:$m, imm_3b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx3a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n),
+ (f64 DPR:$m), timm:$imm)),
+ (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
+ imm_3b:$imm))>;
+}
+
+let Predicates = [HasCDE, HasMVEInt] in {
+ def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
+ (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
+ timm:$imm)),
+ (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
+
+ def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
+ (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm)),
+ (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
+ imm_7b:$imm))>;
+
+ def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
+ (v16i8 MQPR:$m), timm:$imm)),
+ (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
+ imm_4b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ timm:$imm)),
+ (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
+ imm_4b:$imm))>;
+}
+
+multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
+ def : Pat<(VTI.Vec (int_arm_cde_vcx1q_predicated timm:$coproc,
+ (VTI.Vec MQPR:$inactive), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive)))>;
+ def : Pat<(VTI.Vec (int_arm_cde_vcx1qa_predicated timm:$coproc,
+ (VTI.Vec MQPR:$acc), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX1A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
+ imm_12b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+
+ def : Pat<(VTI.Vec (int_arm_cde_vcx2q_predicated timm:$coproc,
+ (VTI.Vec MQPR:$inactive),
+ (v16i8 MQPR:$n), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX2_vec p_imm:$coproc, (v16i8 MQPR:$n),
+ imm_7b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive)))>;
+ def : Pat<(VTI.Vec (int_arm_cde_vcx2qa_predicated timm:$coproc,
+ (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX2A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+
+ def : Pat<(VTI.Vec (int_arm_cde_vcx3q_predicated timm:$coproc,
+ (VTI.Vec MQPR:$inactive),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX3_vec p_imm:$coproc, (v16i8 MQPR:$n),
+ (v16i8 MQPR:$m),
+ imm_4b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive)))>;
+ def : Pat<(VTI.Vec (int_arm_cde_vcx3qa_predicated timm:$coproc,
+ (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX3A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ imm_4b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+}
+
+let Predicates = [HasCDE, HasMVEInt] in
+ foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in
+ defm : VCXPredicatedPat_m<VTI>;
+
+let Predicates = [HasCDE, HasMVEFloat] in
+ foreach VTI = [ MVE_v8f16, MVE_v4f32 ] in
+ defm : VCXPredicatedPat_m<VTI>;
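The vector VCX patterns above have a C-level counterpart in the ACLE vector CDE intrinsics; a hedged sketch, assuming the <arm_cde.h> name __arm_vcx1q_u8 and an MVE-enabled target (the intrinsic name and signature are assumptions, not part of this patch).

/* Hedged sketch: lowers to llvm.arm.cde.vcx1q and is selected into
 * CDE_VCX1_vec by the HasCDE+HasMVEInt patterns above. */
#include <arm_cde.h>

uint8x16_t vcx1q_example(void) {
  return __arm_vcx1q_u8(0, 1111);    /* VCX1 p0, Qd, #1111 */
}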
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td
index 1da32ad2af6c..e13f3437cc7b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -205,7 +205,6 @@ def VPTPredROperand : AsmOperandClass {
let Name = "VPTPredR";
let PredicateMethod = "isVPTPred";
}
-def undef_tied_input;
// Operand classes for the cluster of MC operands describing a
// VPT-predicated MVE instruction.
@@ -409,6 +408,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
bit thumbArithFlagSetting = 0;
bit validForTailPredication = 0;
+ bit retainsPreviousHalfElement = 0;
+ bit horizontalReduction = 0;
+ bit doubleWidthResult = 0;
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
@@ -422,6 +424,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
let TSFlags{18-15} = D.Value;
let TSFlags{19} = thumbArithFlagSetting;
let TSFlags{20} = validForTailPredication;
+ let TSFlags{21} = retainsPreviousHalfElement;
+ let TSFlags{22} = horizontalReduction;
+ let TSFlags{23} = doubleWidthResult;
let Constraints = cstr;
let Itinerary = itin;
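The three new instruction properties are packed into TSFlags bits 21 to 23, so a backend pass reads them back by masking the descriptor's TSFlags word. Below is a standalone C sketch of that extraction; the bit positions come from the hunk above, while the helper names are purely illustrative (the in-tree code would presumably use named masks rather than magic numbers).

/* Illustrative only: reading back the TSFlags bits assigned above. */
#include <stdbool.h>
#include <stdint.h>

static bool retainsPreviousHalfElement(uint64_t TSFlags) { return (TSFlags >> 21) & 1; }
static bool isHorizontalReduction(uint64_t TSFlags)      { return (TSFlags >> 22) & 1; }
static bool hasDoubleWidthResult(uint64_t TSFlags)       { return (TSFlags >> 23) & 1; }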
@@ -1123,6 +1128,9 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
}
+class FPRegs16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasFPRegs16];
+}
class FP16Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [HasFP16];
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index a802d5a06f07..2790ac215f86 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -126,7 +126,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4));
MIB.addMemOperand(MMO);
BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
.addReg(Reg, RegState::Kill)
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
index 3efe85a7d45c..da0a836c8f95 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -159,6 +159,8 @@ def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMseretflag : SDNode<"ARMISD::SERET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
@@ -243,6 +245,12 @@ def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>;
def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>;
def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>;
+def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def SDT_ARMstrd : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def ARMstrd : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
// Vector operations shared between NEON and MVE
def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
@@ -258,7 +266,7 @@ def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
-def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>,
SDTCisVT<2, i32>]>;
def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
@@ -268,6 +276,10 @@ def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def ARMvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def ARMvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
SDTCisVT<2, i32>]>;
@@ -279,6 +291,11 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
+def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
SDTCisInt<3>]>;
def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
@@ -290,6 +307,36 @@ def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCastImpl : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
+// In little-endian, VECTOR_REG_CAST is often turned into bitconvert during
+// lowering (because in that situation they're identical). So an isel pattern
+// that needs to match something that's _logically_ a VECTOR_REG_CAST must
+// _physically_ match a different node type depending on endianness.
+//
+// This 'PatFrags' instance is a centralized facility to make that easy. It
+// matches VECTOR_REG_CAST in either endianness, and also bitconvert in the
+// endianness where it's equivalent.
+def ARMVectorRegCast: PatFrags<
+ (ops node:$x), [(ARMVectorRegCastImpl node:$x), (bitconvert node:$x)], [{
+ // Reject a match against bitconvert (aka ISD::BITCAST) if big-endian
+ return !(CurDAG->getDataLayout().isBigEndian() &&
+ N->getOpcode() == ISD::BITCAST);
+ }]>;
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -396,6 +443,62 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
+def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
+def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
+
+//===----------------------------------------------------------------------===//
+// NEON/MVE pattern fragments
+//
+
+// Extract D sub-registers of Q registers.
+def DSubReg_i8_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
+ MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
+ MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
+ MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+ assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers containing a given f16/bf16 lane.
+def SSubReg_f16_reg : SDNodeXForm<imm, [{
+ assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
+}]>;
+
+
+
//===----------------------------------------------------------------------===//
// Operand Definitions.
//
@@ -2695,6 +2798,14 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
Requires<[IsARM, HasV5TE]>;
}
+let mayLoad = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in {
+def LOADDUAL : ARMPseudoInst<(outs GPRPairOp:$Rt), (ins addrmode3:$addr),
+ 64, IIC_iLoad_d_r, []>,
+ Requires<[IsARM, HasV5TE]> {
+ let AM = AddrMode3;
+}
+}
+
def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
NoItinerary, "lda", "\t$Rt, $addr", []>;
def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
@@ -2766,7 +2877,7 @@ multiclass AI2_ldridx<bit isByte, string opc,
}
let mayLoad = 1, hasSideEffects = 0 in {
-// FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or
+// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or
// IIC_iLoad_siu depending on whether the offset register is shifted.
defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
@@ -2933,6 +3044,9 @@ multiclass AI3ldrT<bits<4> op, string opc> {
let Inst{3-0} = Rm{3-0};
let DecoderMethod = "DecodeLDR";
}
+
+ def ii : ARMAsmPseudo<!strconcat(opc, "${p} $Rt, $addr"),
+ (ins addr_offset_none:$addr, pred:$p), (outs GPR:$Rt)>;
}
defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
@@ -2970,6 +3084,14 @@ let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
}
}
+let mayStore = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in {
+def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr),
+ 64, IIC_iStore_d_r, []>,
+ Requires<[IsARM, HasV5TE]> {
+ let AM = AddrMode3;
+}
+}
+
// Indexed stores
multiclass AI2_stridx<bit isByte, string opc,
InstrItinClass iii, InstrItinClass iir> {
@@ -3036,7 +3158,7 @@ multiclass AI2_stridx<bit isByte, string opc,
}
let mayStore = 1, hasSideEffects = 0 in {
-// FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or
+// FIXME: for STR_PRE_REG etc. the itinerary should be either IIC_iStore_ru or
// IIC_iStore_siu depending on whether the offset register is shifted.
defm STR : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
@@ -3770,9 +3892,8 @@ def QSUB16 : AAIIntrinsic<0b01100010, 0b11110111, "qsub16", int_arm_qsub16>;
def QSUB8 : AAIIntrinsic<0b01100010, 0b11111111, "qsub8", int_arm_qsub8>;
def QDADD : AAIRevOpr<0b00010100, 0b00000101, "qdadd",
- [(set GPRnopc:$Rd, (int_arm_qadd (int_arm_qadd GPRnopc:$Rm,
- GPRnopc:$Rm),
- GPRnopc:$Rn))]>;
+ [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm,
+ (int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>;
def QDSUB : AAIRevOpr<0b00010110, 0b00000101, "qdsub",
[(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm,
(int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>;
@@ -3787,7 +3908,7 @@ def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b),
(QADD GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b),
(QSUB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+def : ARMV5TEPat<(saddsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(QDADD rGPR:$Rm, rGPR:$Rn)>;
def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(QDSUB rGPR:$Rm, rGPR:$Rn)>;
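The reordered operands encode the architectural semantics of QDADD/QDSUB: the second source is saturating-doubled, then saturating-added to (or subtracted from) the first. In C, using the __qadd/__qsub saturating intrinsics from <arm_acle.h> (available with the DSP extension), the shape the corrected patterns match is roughly:

/* Rough C equivalent of the corrected QDADD/QDSUB patterns:
 * Rd = sat(Rm + sat(2*Rn)) and Rd = sat(Rm - sat(2*Rn)). */
#include <arm_acle.h>
#include <stdint.h>

int32_t qdadd_shape(int32_t m, int32_t n) { return __qadd(m, __qadd(n, n)); }
int32_t qdsub_shape(int32_t m, int32_t n) { return __qsub(m, __qadd(n, n)); }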
@@ -5414,7 +5535,8 @@ def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
- imm0_7:$opc2), []>;
+ imm0_7:$opc2), []>,
+ ComplexDeprecationPredicate<"MRC">;
def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
@@ -5691,7 +5813,7 @@ def : ARMPat<(ARMthread_pointer), (MRC 15, 0, 13, 0, 3)>,
// when we get here from a longjmp(). We force everything out of registers
// except for our own input by listing the relevant registers in Defs. By
// doing so, we also cause the prologue/epilogue code to actively preserve
-// all of the callee-saved resgisters, which is exactly what we want.
+// all of the callee-saved registers, which is exactly what we want.
// A constant value is passed in $val, and we use the location as a scratch.
//
// These are pseudo-instructions and are lowered to individual MC-insts, so
@@ -5976,6 +6098,12 @@ include "ARMInstrNEON.td"
include "ARMInstrMVE.td"
//===----------------------------------------------------------------------===//
+// CDE (Custom Datapath Extension)
+//
+
+include "ARMInstrCDE.td"
+
+//===----------------------------------------------------------------------===//
// Assembler aliases
//
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
index 604291be822c..2a1f50d97e3b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -10,44 +10,6 @@
//
//===----------------------------------------------------------------------===//
-class ExpandImmAsmOp<string shift> : AsmOperandClass {
- let Name = !strconcat("ExpandImm", shift);
- let PredicateMethod = !strconcat("isExpImm<", shift, ">");
- let RenderMethod = "addImmOperands";
-}
-class InvertedExpandImmAsmOp<string shift, string size> : AsmOperandClass {
- let Name = !strconcat("InvertedExpandImm", shift, "_", size);
- let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">");
- let RenderMethod = "addImmOperands";
-}
-
-class ExpandImm<string shift> : Operand<i32> {
- let ParserMatchClass = ExpandImmAsmOp<shift>;
- let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>");
- let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">");
- let PrintMethod = "printExpandedImmOperand";
-}
-class InvertedExpandImm<string shift, string size> : Operand<i32> {
- let ParserMatchClass = InvertedExpandImmAsmOp<shift, size>;
- let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>");
- let PrintMethod = "printExpandedImmOperand";
- // No decoder method needed, because this operand type is only used
- // by aliases (VAND and VORN)
-}
-
-def expzero00 : ExpandImm<"0">;
-def expzero08 : ExpandImm<"8">;
-def expzero16 : ExpandImm<"16">;
-def expzero24 : ExpandImm<"24">;
-
-def expzero00inv16 : InvertedExpandImm<"0", "16">;
-def expzero08inv16 : InvertedExpandImm<"8", "16">;
-
-def expzero00inv32 : InvertedExpandImm<"0", "32">;
-def expzero08inv32 : InvertedExpandImm<"8", "32">;
-def expzero16inv32 : InvertedExpandImm<"16", "32">;
-def expzero24inv32 : InvertedExpandImm<"24", "32">;
-
// VPT condition mask
def vpt_mask : Operand<i32> {
let PrintMethod = "printVPTMask";
@@ -277,7 +239,8 @@ class mve_addr_q_shift<int shift> : MemOperand {
// A family of classes wrapping up information about the vector types
// used by MVE.
-class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
+class MVEVectorVTInfo<ValueType vec, ValueType dblvec,
+ ValueType pred, ValueType dblpred,
bits<2> size, string suffixletter, bit unsigned> {
// The LLVM ValueType representing the vector, so we can use it in
// ISel patterns.
@@ -300,6 +263,9 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
// directly.
ValueType Pred = pred;
+ // Same as Pred but for DblVec rather than Vec.
+ ValueType DblPred = dblpred;
+
// The most common representation of the vector element size in MVE
// instruction encodings: a 2-bit value V representing an (8<<V)-bit
// vector element.
@@ -319,38 +285,38 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
!cast<string>(LaneBits));
// The suffix used on an instruction that mentions the whole type.
- string Suffix = suffixletter ## BitsSuffix;
+ string Suffix = suffixletter # BitsSuffix;
// The letter part of the suffix only.
string SuffixLetter = suffixletter;
}
// Integer vector types that don't treat signed and unsigned differently.
-def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>;
-def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>;
-def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>;
-def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>;
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "i", ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "i", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "i", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "i", ?>;
// Explicitly signed and unsigned integer vectors. They map to the
// same set of LLVM ValueTypes as above, but are represented
// differently in assembly and instruction encodings.
-def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>;
-def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>;
-def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>;
-def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>;
-def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>;
-def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>;
-def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>;
-def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>;
+def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "s", 0b0>;
+def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "s", 0b0>;
+def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "s", 0b0>;
+def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "s", 0b0>;
+def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "u", 0b1>;
+def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "u", 0b1>;
+def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "u", 0b1>;
+def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "u", 0b1>;
// FP vector types.
-def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>;
-def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>;
-def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>;
+def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, v4i1, 0b01, "f", ?>;
+def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v4i1, 0b10, "f", ?>;
+def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>;
// Polynomial vector types.
-def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>;
-def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>;
+def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>;
+def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>;
// --------- Start of base classes for the instructions themselves
@@ -473,6 +439,8 @@ class MVE_ScalarShiftDoubleReg<string iname, dag iops, string asm,
let Inst{19-17} = RdaLo{3-1};
let Inst{11-9} = RdaHi{3-1};
+
+ let hasSideEffects = 0;
}
class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16,
@@ -590,6 +558,7 @@ class MVE_VABAV<string suffix, bit U, bits<2> size>
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
+ let horizontalReduction = 1;
}
multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
@@ -639,38 +608,63 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let horizontalReduction = 1;
+ let validForTailPredication = 1;
}
-multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size,
- list<dag> pattern=[]> {
- def acc : MVE_VADDV<"vaddva", suffix,
+def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
+def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+
+multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
+ def acc : MVE_VADDV<"vaddva", VTI.Suffix,
(ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src",
- 0b1, U, size, pattern>;
- def no_acc : MVE_VADDV<"vaddv", suffix,
+ 0b1, VTI.Unsigned, VTI.Size>;
+ def no_acc : MVE_VADDV<"vaddv", VTI.Suffix,
(ins MQPR:$Qm), "",
- 0b0, U, size, pattern>;
-}
+ 0b0, VTI.Unsigned, VTI.Size>;
-defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>;
-defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>;
-defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>;
-defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
-defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
-defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+ defvar InstA = !cast<Instruction>(NAME # "acc");
+ defvar InstN = !cast<Instruction>(NAME # "no_acc");
-let Predicates = [HasMVEInt] in {
- def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
- def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
- def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))),
- (i32 (MVE_VADDVu32acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))),
- (i32 (MVE_VADDVu16acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))),
- (i32 (MVE_VADDVu8acc $src2, $src1))>;
+ let Predicates = [HasMVEInt] in {
+ if VTI.Unsigned then {
+ def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ } else {
+ def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ }
+ def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+ (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$pred))),
+ (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+ (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$pred)),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ }
}
+defm MVE_VADDVs8 : MVE_VADDV_A<MVE_v16s8>;
+defm MVE_VADDVs16 : MVE_VADDV_A<MVE_v8s16>;
+defm MVE_VADDVs32 : MVE_VADDV_A<MVE_v4s32>;
+defm MVE_VADDVu8 : MVE_VADDV_A<MVE_v16u8>;
+defm MVE_VADDVu16 : MVE_VADDV_A<MVE_v8u16>;
+defm MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>;
+
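The unsigned half of the multiclass above also matches plain vecreduce_add nodes, which is what MVE auto-vectorization typically produces for a simple sum reduction; one such loop shape in plain C (no intrinsics assumed):

/* A sum reduction whose MVE-vectorized form can become the
 * vecreduce_add (+ accumulator) nodes matched by the VADDV patterns above. */
#include <stdint.h>

uint32_t sum_u32(const uint32_t *p, int n) {
  uint32_t s = 0;
  for (int i = 0; i < n; i++)
    s += p[i];
  return s;
}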
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
@@ -689,21 +683,58 @@ class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
-}
-
-multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> {
- def acc : MVE_VADDLV<"vaddlva", suffix,
+ let horizontalReduction = 1;
+}
+
+def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>
+]>;
+def SDTVecReduceLA : SDTypeProfile<2, 3, [ // VADDLVA
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>
+]>;
+def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2>
+]>;
+def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>, SDTCisVec<5>
+]>;
+
+multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> {
+ def acc : MVE_VADDLV<"vaddlva", VTI.Suffix,
(ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
"$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
- 0b1, U, pattern>;
- def no_acc : MVE_VADDLV<"vaddlv", suffix,
+ 0b1, VTI.Unsigned>;
+ def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix,
(ins MQPR:$Qm), "",
- 0b0, U, pattern>;
-}
+ 0b0, VTI.Unsigned>;
+
+ defvar InstA = !cast<Instruction>(NAME # "acc");
+ defvar InstN = !cast<Instruction>(NAME # "no_acc");
+ defvar letter = VTI.SuffixLetter;
+ defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>;
+ defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>;
+ defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>;
+ defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>;
-defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>;
-defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)),
+ (InstN (v4i32 MQPR:$vec))>;
+ def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)),
+ (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>;
+ def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)),
+ (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
+ (VTI.Pred VCCR:$pred)),
+ (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ }
+}
+
+defm MVE_VADDLVs32 : MVE_VADDLV_A<MVE_v4s32>;
+defm MVE_VADDLVu32 : MVE_VADDLV_A<MVE_v4u32>;
class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
bit bit_17, bit bit_7, list<dag> pattern=[]>
@@ -724,25 +755,48 @@ class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
let Inst{6-5} = 0b00;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let horizontalReduction = 1;
let Predicates = [HasMVEFloat];
+ let hasSideEffects = 0;
}
-multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
- def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>;
- def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>;
-}
+multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin,
+ MVEVectorVTInfo VTI, string intrBaseName,
+ ValueType Scalar, RegisterClass ScalarReg> {
+ def "": MVE_VMINMAXNMV<iname, VTI.Suffix, VTI.Size{0}, notAbs, isMin>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+ defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated");
-defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>;
-defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>;
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev),
+ (VTI.Vec MQPR:$vec))),
+ (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+ (VTI.Vec MQPR:$vec)),
+ ScalarReg)>;
+ def : Pat<(Scalar (pred_intr (Scalar ScalarReg:$prev),
+ (VTI.Vec MQPR:$vec),
+ (VTI.Pred VCCR:$pred))),
+ (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+ (VTI.Vec MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred)),
+ ScalarReg)>;
+ }
+}
-multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
- def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>;
- def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>;
+multiclass MVE_VMINMAXNMV_fty<string iname, bit notAbs, bit isMin,
+ string intrBase> {
+ defm f32 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v4f32, intrBase,
+ f32, SPR>;
+ defm f16 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v8f16, intrBase,
+ f16, HPR>;
}
-defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>;
-defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>;
+defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 1, 1, "int_arm_mve_minnmv">;
+defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 1, 0, "int_arm_mve_maxnmv">;
+defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">;
+defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">;
class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
bit bit_17, bit bit_7, list<dag> pattern=[]>
@@ -762,33 +816,40 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
let Inst{6-5} = 0b00;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let horizontalReduction = 1;
}
-multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
- MVEVectorVTInfo VTI, Intrinsic intr> {
+multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin,
+ MVEVectorVTInfo VTI, string intrBaseName> {
def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
- bit_17, bit_7>;
- defvar Inst = !cast<Instruction>(NAME);
+ notAbs, isMin>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+ defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated");
+ defvar base_args = (? (i32 rGPR:$prev), (VTI.Vec MQPR:$vec));
+ defvar args = !if(notAbs, !con(base_args, (? (i32 VTI.Unsigned))),
+ base_args);
- let Predicates = [HasMVEInt] in
- def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
- (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 !con(args, (unpred_intr))),
+ (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+ def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))),
+ (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ }
}
-multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
- Intrinsic intr_s, Intrinsic intr_u> {
- defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
- defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
- defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
- defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
- defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
- defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
+multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> {
+ defm s8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16s8, intrBaseName>;
+ defm s16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8s16, intrBaseName>;
+ defm s32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4s32, intrBaseName>;
+ defm u8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16u8, intrBaseName>;
+ defm u16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8u16, intrBaseName>;
+ defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>;
}
-defm MVE_VMINV : MVE_VMINMAXV_ty<
- "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<
- "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
+defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">;
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
@@ -819,14 +880,14 @@ let Predicates = [HasMVEInt] in {
}
-multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
- def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>;
- def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
- def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>;
+multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> {
+ defm s8 : MVE_VMINMAXV_p<iname, 0, isMin, MVE_v16s8, intrBaseName>;
+ defm s16: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v8s16, intrBaseName>;
+ defm s32: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v4s32, intrBaseName>;
}
-defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
-defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
+defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">;
+defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">;
class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
@@ -847,6 +908,12 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = bit_0;
+ let horizontalReduction = 1;
+ // Allow tail predication for non-exchanging versions. As this is also a
+ // horizontalReduction, ARMLowOverheadLoops will also have to check that
+ // the vector operands contain zeros in their false lanes for the instruction
+ // to be properly valid.
+ let validForTailPredication = !eq(X, 0);
}
multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
@@ -932,6 +999,58 @@ defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>;
+def SDTVecReduce2 : SDTypeProfile<1, 2, [ // VMLAV
+ SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2>
+]>;
+def SDTVecReduce2L : SDTypeProfile<2, 2, [ // VMLALV
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3>
+]>;
+def SDTVecReduce2LA : SDTypeProfile<2, 4, [ // VMLALVA
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>, SDTCisVec<5>
+]>;
+def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>;
+def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>;
+def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>;
+def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>;
+def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>;
+def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
+ (i32 (MVE_VMLADAVu32 $src1, $src2))>;
+ def : Pat<(i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
+ (i32 (MVE_VMLADAVu16 $src1, $src2))>;
+ def : Pat<(i32 (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
+ (i32 (MVE_VMLADAVu8 $src1, $src2))>;
+ def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+
+ def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>;
+ def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>;
+ def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+}
+
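These patterns recognise a multiply feeding a sum reduction (optionally with a scalar accumulator) and collapse it into a single VMLADAV/VMLADAVA; the classic source shape is an integer dot product, sketched in plain C:

/* After MVE vectorization this loop becomes vecreduce_add(mul(...))
 * plus an accumulator add, which the patterns above select to
 * VMLADAVA (or VMLADAV without the accumulator). */
#include <stdint.h>

int32_t dot_i32(const int32_t *a, const int32_t *b, int n) {
  int32_t s = 0;
  for (int i = 0; i < n; i++)
    s += a[i] * b[i];
  return s;
}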
// vmlav aliases vmladav
foreach acc = ["", "a"] in {
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in {
@@ -963,6 +1082,14 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = bit_0;
+ let horizontalReduction = 1;
+ // Allow tail predication for non-exchanging versions. As this is also a
+ // horizontalReduction, ARMLowOverheadLoops will also have to check that
+ // the vector operands contain zeros in their false lanes for the instruction
+ // to be properly valid.
+ let validForTailPredication = !eq(X, 0);
+
+ let hasSideEffects = 0;
}
multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix,
@@ -1023,6 +1150,26 @@ multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> {
defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>;
defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(ARMVMLALVs (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+
+ def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+}
+
// vmlalv aliases vmlaldav
foreach acc = ["", "a"] in {
foreach suffix = ["s16", "s32", "u16", "u32"] in {
@@ -1244,28 +1391,29 @@ let Predicates = [HasMVEInt] in {
(v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>;
}
-let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
- (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
- def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
- (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>;
- def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))),
- (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>;
+multiclass MVE_VREV_basic_patterns<int revbits, list<MVEVectorVTInfo> VTIs,
+ Instruction Inst> {
+ defvar unpred_op = !cast<SDNode>("ARMvrev" # revbits);
- def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))),
- (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>;
- def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))),
- (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>;
+ foreach VTI = VTIs in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src)))>;
+ def : Pat<(VTI.Vec (int_arm_mve_vrev_predicated (VTI.Vec MQPR:$src),
+ revbits, (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+let Predicates = [HasMVEInt] in {
+ defm: MVE_VREV_basic_patterns<64, [MVE_v4i32, MVE_v4f32], MVE_VREV64_32>;
+ defm: MVE_VREV_basic_patterns<64, [MVE_v8i16, MVE_v8f16], MVE_VREV64_16>;
+ defm: MVE_VREV_basic_patterns<64, [MVE_v16i8 ], MVE_VREV64_8>;
- def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))),
- (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>;
+ defm: MVE_VREV_basic_patterns<32, [MVE_v8i16, MVE_v8f16], MVE_VREV32_16>;
+ defm: MVE_VREV_basic_patterns<32, [MVE_v16i8 ], MVE_VREV32_8>;
- def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))),
- (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>;
- def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))),
- (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>;
- def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))),
- (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>;
+ defm: MVE_VREV_basic_patterns<16, [MVE_v16i8 ], MVE_VREV16_8>;
}
def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
@@ -1280,14 +1428,14 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
}
let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))),
- (v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>;
- def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))),
- (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>;
- def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))),
- (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>;
- def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))),
- (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>;
+ foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in {
+ def : Pat<(VTI.Vec (vnotq (VTI.Vec MQPR:$val1))),
+ (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1)))>;
+ def : Pat<(VTI.Vec (int_arm_mve_mvn_predicated (VTI.Vec MQPR:$val1),
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
}
class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
@@ -1383,10 +1531,10 @@ defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>;
-class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
+class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps>
: MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
- bits<8> imm;
+ bits<12> imm;
bits<4> Qd;
let Inst{28} = imm{7};
@@ -1396,66 +1544,59 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
let Inst{18-16} = imm{6-4};
let Inst{15-13} = Qd{2-0};
let Inst{12} = 0b0;
- let Inst{11-8} = cmode;
+ let Inst{11} = halfword;
+ let Inst{10} = !if(halfword, 0, imm{10});
+ let Inst{9} = imm{9};
+ let Inst{8} = 0b1;
let Inst{7-6} = 0b01;
let Inst{4} = 0b1;
let Inst{3-0} = imm{3-0};
}
-class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type>
- : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
- let Inst{5} = 0b0;
- let validForTailPredication = 1;
-}
+multiclass MVE_bit_cmode_p<string iname, bit opcode,
+ MVEVectorVTInfo VTI, Operand imm_type, SDNode op> {
+ def "" : MVE_bit_cmode<iname, VTI.Suffix, VTI.Size{0},
+ (ins MQPR:$Qd_src, imm_type:$imm)> {
+ let Inst{5} = opcode;
+ let validForTailPredication = 1;
+ }
-def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>;
-def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>;
-def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>;
-def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>;
-def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>;
-def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>;
-
-def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar UnpredPat = (VTI.Vec (op (VTI.Vec MQPR:$src), timm:$simm));
-def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
- (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<UnpredPat, (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ UnpredPat, (VTI.Vec MQPR:$src))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm,
+ ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ }
+}
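// In the multiclass above, defining UnpredPat once lets the same dag serve both
// selections: on its own it matches the plain immediate VORR/VBIC, and wrapped
// in (vselect pred, UnpredPat, $src) it matches the ARMVCCThen form, with the
// tied source register supplying the inactive lanes.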
-class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type>
- : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
- let Inst{5} = 0b1;
- let validForTailPredication = 1;
+multiclass MVE_VORRimm<MVEVectorVTInfo VTI, Operand imm_type> {
+ defm "": MVE_bit_cmode_p<"vorr", 0, VTI, imm_type, ARMvorrImm>;
+}
+multiclass MVE_VBICimm<MVEVectorVTInfo VTI, Operand imm_type> {
+ defm "": MVE_bit_cmode_p<"vbic", 1, VTI, imm_type, ARMvbicImm>;
}
-def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>;
-def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>;
-def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>;
-def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>;
-def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>;
-def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>;
-
-def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+defm MVE_VORRimmi16 : MVE_VORRimm<MVE_v8i16, nImmSplatI16>;
+defm MVE_VORRimmi32 : MVE_VORRimm<MVE_v4i32, nImmSplatI32>;
+defm MVE_VBICimmi16 : MVE_VBICimm<MVE_v8i16, nImmSplatI16>;
+defm MVE_VBICimmi32 : MVE_VBICimm<MVE_v4i32, nImmSplatI32>;
+
+def MVE_VORNimmi16 : MVEInstAlias<"vorn${vp}.i16\t$Qd, $imm",
+ (MVE_VORRimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>;
+def MVE_VORNimmi32 : MVEInstAlias<"vorn${vp}.i32\t$Qd, $imm",
+ (MVE_VORRimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>;
+
+def MVE_VANDimmi16 : MVEInstAlias<"vand${vp}.i16\t$Qd, $imm",
+ (MVE_VBICimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>;
+def MVE_VANDimmi32 : MVEInstAlias<"vand${vp}.i32\t$Qd, $imm",
+ (MVE_VBICimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>;
+
+def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
+ (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>;
class MVE_VMOV_lane_direction {
bit bit_20;
@@ -1494,6 +1635,8 @@ class MVE_VMOV_lane<string suffix, bit U, dag indexop,
let Inst{11-8} = 0b1011;
let Inst{7} = Qd{3};
let Inst{4-0} = 0b10000;
+
+ let hasSideEffects = 0;
}
class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir>
@@ -1557,10 +1700,14 @@ let Predicates = [HasMVEInt] in {
(MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlanes (v8f16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
@@ -1575,8 +1722,8 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
- def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
- (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
+ def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm:$lane),
+ (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
@@ -1588,8 +1735,8 @@ let Predicates = [HasMVEInt] in {
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
- def : Pat<(v8f16 (scalar_to_vector HPR:$src)),
- (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+ def : Pat<(v8f16 (scalar_to_vector (f16 HPR:$src))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
}
@@ -1882,6 +2029,26 @@ class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
+def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
+ (add node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
+ (add node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoSignedWrap();
+}]>;
+
+def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
+ (sub node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
+ (sub node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoSignedWrap();
+}]>;
+
multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>;
@@ -1913,6 +2080,37 @@ defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>;
defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>;
defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
+// Rounding Halving Add performs the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no-wrap forms of
+// add to ensure that the extra bit of information is not needed for the
+// arithmetic or the rounding.
+def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+ (v16i8 (ARMvmovImm (i32 3585)))),
+ (i32 1))),
+ (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+ (v8i16 (ARMvmovImm (i32 2049)))),
+ (i32 1))),
+ (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+ (v4i32 (ARMvmovImm (i32 1)))),
+ (i32 1))),
+ (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+ (v16i8 (ARMvmovImm (i32 3585)))),
+ (i32 1))),
+ (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+ (v8i16 (ARMvmovImm (i32 2049)))),
+ (i32 1))),
+ (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+ (v4i32 (ARMvmovImm (i32 1)))),
+ (i32 1))),
+ (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
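// A rough sketch of what the six patterns above match, assuming the standard
// AdvSIMD modified-immediate encoding for the ARMvmovImm operand (cmode in
// bits 11-8, imm8 in bits 7-0): 3585 = 0xE01 and 2049 = 0x801 are byte and
// halfword splats of 1, and 1 is an i32 splat of 1, so each pattern is the
// DAG for (Qm + Qn + 1) >> 1 with no-wrap adds and a signed or unsigned
// shift, i.e. the per-lane rounding halving add that VRHADD implements.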
+
+
class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
bits<2> size, list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
@@ -1936,7 +2134,8 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size,
: MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode unpred_op, Intrinsic pred_int, PatFrag add_op,
+ SDNode shift_op> {
def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
@@ -1945,6 +2144,9 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
+ (Inst MQPR:$Qm, MQPR:$Qn)>;
+
// Predicated add-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
@@ -1954,18 +2156,24 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
}
}
-multiclass MVE_VHADD<MVEVectorVTInfo VTI>
- : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>;
+multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op>
+ : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op,
+ shift_op>;
-defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>;
-defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>;
-defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>;
-defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>;
-defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>;
-defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>;
+// Halving add/sub perform the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no-wrap forms of
+// add/sub to ensure that the extra bit of information is not needed.
+defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, addnsw, ARMvshrsImm>;
+defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8, addnuw, ARMvshruImm>;
+defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, addnuw, ARMvshruImm>;
+defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, addnuw, ARMvshruImm>;
multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op,
+ SDNode shift_op> {
def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
@@ -1975,6 +2183,10 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
(i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (shift_op (sub_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
+ (Inst MQPR:$Qm, MQPR:$Qn)>;
+
+
// Predicated subtract-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
@@ -1985,15 +2197,16 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
}
}
-multiclass MVE_VHSUB<MVEVectorVTInfo VTI>
- : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>;
+multiclass MVE_VHSUB<MVEVectorVTInfo VTI, PatFrag sub_op, SDNode shift_op>
+ : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, sub_op,
+ shift_op>;
-defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>;
-defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>;
-defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>;
-defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>;
-defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>;
-defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>;
+defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8, subnuw, ARMvshruImm>;
+defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16, subnuw, ARMvshruImm>;
+defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32, subnuw, ARMvshruImm>;
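// A short sketch of the extra pattern added to MVE_VHADD_m/MVE_VHSUB_m above:
// it matches the DAG for (Qm + Qn) >> 1 or (Qm - Qn) >> 1, where the add/sub
// carries the nsw/nuw flag required by the addnsw/addnuw/subnsw/subnuw
// PatFrags and the shift is the matching signed or unsigned vector shift, so
// the halving instruction can be used without having to model its extra
// internal bit of precision.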
class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
@@ -2028,24 +2241,37 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP32 rGPR:$elem)>;
- def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
- (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
- // For the 16-bit and 8-bit vduplanes we don't care about the signedness
- // of the lane move operation as we only want the lowest 8/16 bits anyway.
- def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
- (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
- def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
- (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
-
- def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
- (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
- def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
- (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
+ def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP16 rGPR:$elem)>;
+ def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP32 rGPR:$elem)>;
- def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
- (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
- def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
- (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+ // Match a vselect with an ARMvdup as a predicated MVE_VDUP
+ def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred),
+ (v16i8 (ARMvdup (i32 rGPR:$elem))),
+ (v16i8 MQPR:$inactive))),
+ (MVE_VDUP8 rGPR:$elem, ARMVCCThen, (v16i1 VCCR:$pred),
+ (v16i8 MQPR:$inactive))>;
+ def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred),
+ (v8i16 (ARMvdup (i32 rGPR:$elem))),
+ (v8i16 MQPR:$inactive))),
+ (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+ (v8i16 MQPR:$inactive))>;
+ def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred),
+ (v4i32 (ARMvdup (i32 rGPR:$elem))),
+ (v4i32 MQPR:$inactive))),
+ (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+ (v4i32 MQPR:$inactive))>;
+ def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred),
+ (v4f32 (ARMvdup (i32 rGPR:$elem))),
+ (v4f32 MQPR:$inactive))),
+ (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+ (v4f32 MQPR:$inactive))>;
+ def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred),
+ (v8f16 (ARMvdup (i32 rGPR:$elem))),
+ (v8f16 MQPR:$inactive))),
+ (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+ (v8f16 MQPR:$inactive))>;
}
@@ -2079,32 +2305,43 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>;
-def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>;
-def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>;
+multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
+ SDNode unpred_op> {
+ def "": MVE_VCLSCLZ<"v"#opname, VTI.Suffix, VTI.Size, opcode>;
-def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>;
-def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
-def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_"#opname#"_predicated");
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))),
- (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>;
- def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))),
- (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>;
- def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))),
- (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
}
+defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, int_arm_mve_vcls>;
+defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>;
+defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, int_arm_mve_vcls>;
+
+defm MVE_VCLZs8 : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>;
+defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>;
+defm MVE_VCLZs32 : MVE_VCLSCLZ_p<"clz", 1, MVE_v4i32, ctlz>;
+
class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
- list<dag> pattern=[]>
+ bit saturate, list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-20} = 0b11;
- let Inst{17-16} = 0b01;
- let Inst{12-8} = 0b00011;
+ let Inst{17} = 0b0;
+ let Inst{16} = !eq(saturate, 0);
+ let Inst{12-11} = 0b00;
+ let Inst{10} = saturate;
+ let Inst{9-8} = 0b11;
let Inst{7} = negate;
let Inst{6} = 0b1;
let Inst{4} = 0b0;
@@ -2112,61 +2349,40 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
let validForTailPredication = 1;
}
-def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>;
-def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>;
-def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (abs (v16i8 MQPR:$v))),
- (v16i8 (MVE_VABSs8 $v))>;
- def : Pat<(v8i16 (abs (v8i16 MQPR:$v))),
- (v8i16 (MVE_VABSs16 $v))>;
- def : Pat<(v4i32 (abs (v4i32 MQPR:$v))),
- (v4i32 (MVE_VABSs32 $v))>;
-}
+multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate,
+ SDNode unpred_op, Intrinsic pred_int,
+ MVEVectorVTInfo VTI> {
+ def "" : MVE_VABSNEG_int<iname, VTI.Suffix, VTI.Size, negate, saturate>;
+ defvar Inst = !cast<Instruction>(NAME);
-def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>;
-def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>;
-def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>;
+ let Predicates = [HasMVEInt] in {
+ // VQABS and VQNEG have more difficult isel patterns defined elsewhere
+ if !eq(saturate, 0) then {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>;
+ }
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))),
- (v16i8 (MVE_VNEGs8 $v))>;
- def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))),
- (v8i16 (MVE_VNEGs16 $v))>;
- def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))),
- (v4i32 (MVE_VNEGs32 $v))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
+ }
}
-class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
- bit negate, list<dag> pattern=[]>
- : MVEIntSingleSrc<iname, suffix, size, pattern> {
-
- let Inst{28} = 0b1;
- let Inst{25-23} = 0b111;
- let Inst{21-20} = 0b11;
- let Inst{17-16} = 0b00;
- let Inst{12-8} = 0b00111;
- let Inst{7} = negate;
- let Inst{6} = 0b1;
- let Inst{4} = 0b0;
- let Inst{0} = 0b0;
- let validForTailPredication = 1;
+foreach VTI = [ MVE_v16s8, MVE_v8s16, MVE_v4s32 ] in {
+ defm "MVE_VABS" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vabs", 0, 0, abs, int_arm_mve_abs_predicated, VTI>;
+ defm "MVE_VQABS" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vqabs", 0, 1, ?, int_arm_mve_qabs_predicated, VTI>;
+ defm "MVE_VNEG" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vneg", 1, 0, vnegq, int_arm_mve_neg_predicated, VTI>;
+ defm "MVE_VQNEG" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vqneg", 1, 1, ?, int_arm_mve_qneg_predicated, VTI>;
}
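// Note on the foreach above: `?` is TableGen's uninitialized value, passed as
// unpred_op for the saturating VQABS/VQNEG variants because, as the comment in
// the multiclass says, their unpredicated isel patterns live elsewhere; only
// the predicated intrinsic pattern is generated for them here.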
-def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>;
-def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>;
-def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>;
-
-def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>;
-def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>;
-def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>;
-
// int_min/int_max: vector containing INT_MIN/INT_MAX VTI.Size times
// zero_vec: v4i32-initialized zero vector, potentially wrapped in a bitconvert
multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag int_max,
- dag zero_vec, MVE_VQABSNEG vqabs_instruction,
- MVE_VQABSNEG vqneg_instruction> {
+ dag zero_vec, MVE_VABSNEG_int vqabs_instruction,
+ MVE_VABSNEG_int vqneg_instruction> {
let Predicates = [HasMVEInt] in {
// The below tree can be replaced by a vqabs instruction, as it represents
// the following vectorized expression (r being the value in $reg):
@@ -2257,6 +2473,8 @@ let Predicates = [HasMVEInt] in {
(v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>;
def : Pat<(v4i32 (ARMvmovImm timm:$simm)),
(v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>;
+ def : Pat<(v2i64 (ARMvmovImm timm:$simm)),
+ (v2i64 (MVE_VMOVimmi64 nImmSplatI64:$simm))>;
def : Pat<(v8i16 (ARMvmvnImm timm:$simm)),
(v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>;
@@ -2265,6 +2483,15 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)),
(v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>;
+
+ def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
+ MQPR:$inactive)),
+ (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm,
+ ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
+ def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
+ MQPR:$inactive)),
+ (v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm,
+ ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
}
class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
@@ -2291,13 +2518,37 @@ class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>;
-def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>;
-def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>;
+multiclass MVE_VMINMAXA_m<string iname, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int, bit bit_12> {
+ def "" : MVE_VMINMAXA<iname, VTI.Suffix, VTI.Size, bit_12>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated v(min|max)a
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (abs (VTI.Vec MQPR:$Qm)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>;
+
+ // Predicated v(min|max)a
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
+}
+
+multiclass MVE_VMINA<MVEVectorVTInfo VTI>
+ : MVE_VMINMAXA_m<"vmina", VTI, umin, int_arm_mve_vmina_predicated, 0b1>;
+
+defm MVE_VMINAs8 : MVE_VMINA<MVE_v16s8>;
+defm MVE_VMINAs16 : MVE_VMINA<MVE_v8s16>;
+defm MVE_VMINAs32 : MVE_VMINA<MVE_v4s32>;
-def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>;
-def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>;
-def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>;
+multiclass MVE_VMAXA<MVEVectorVTInfo VTI>
+ : MVE_VMINMAXA_m<"vmaxa", VTI, umax, int_arm_mve_vmaxa_predicated, 0b0>;
+
+defm MVE_VMAXAs8 : MVE_VMAXA<MVE_v16s8>;
+defm MVE_VMAXAs16 : MVE_VMAXA<MVE_v8s16>;
+defm MVE_VMAXAs32 : MVE_VMAXA<MVE_v4s32>;
// end of MVE Integer instructions
@@ -2334,7 +2585,7 @@ class MVE_shift_imm<dag oops, dag iops, string iname, string suffix,
let Inst{3-1} = Qm{2-0};
}
-class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U,
+class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, bit top,
list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
iname, suffix, "$Qd, $Qm", vpred_r, "",
@@ -2344,25 +2595,36 @@ class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U,
let Inst{21} = 0b1;
let Inst{20-19} = sz{1-0};
let Inst{18-16} = 0b000;
+ let Inst{12} = top;
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let doubleWidthResult = 1;
}
-multiclass MVE_VMOVL_shift_half<string iname, string suffix, bits<2> sz, bit U,
- list<dag> pattern=[]> {
- def bh : MVE_VMOVL<!strconcat(iname, "b"), suffix, sz, U, pattern> {
- let Inst{12} = 0b0;
- }
- def th : MVE_VMOVL<!strconcat(iname, "t"), suffix, sz, U, pattern> {
- let Inst{12} = 0b1;
- }
+multiclass MVE_VMOVL_m<bit top, string chr, MVEVectorVTInfo OutVTI,
+ MVEVectorVTInfo InVTI> {
+ def "": MVE_VMOVL<"vmovl" # chr, InVTI.Suffix, OutVTI.Size,
+ InVTI.Unsigned, top>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ def : Pat<(OutVTI.Vec (int_arm_mve_vmovl_predicated (InVTI.Vec MQPR:$src),
+ (i32 InVTI.Unsigned), (i32 top),
+ (OutVTI.Pred VCCR:$pred),
+ (OutVTI.Vec MQPR:$inactive))),
+ (OutVTI.Vec (Inst (InVTI.Vec MQPR:$src), ARMVCCThen,
+ (OutVTI.Pred VCCR:$pred),
+ (OutVTI.Vec MQPR:$inactive)))>;
}
-defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>;
-defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>;
-defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>;
-defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>;
+defm MVE_VMOVLs8bh : MVE_VMOVL_m<0, "b", MVE_v8s16, MVE_v16s8>;
+defm MVE_VMOVLs8th : MVE_VMOVL_m<1, "t", MVE_v8s16, MVE_v16s8>;
+defm MVE_VMOVLu8bh : MVE_VMOVL_m<0, "b", MVE_v8u16, MVE_v16u8>;
+defm MVE_VMOVLu8th : MVE_VMOVL_m<1, "t", MVE_v8u16, MVE_v16u8>;
+defm MVE_VMOVLs16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8s16>;
+defm MVE_VMOVLs16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8s16>;
+defm MVE_VMOVLu16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8u16>;
+defm MVE_VMOVLu16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8u16>;
let Predicates = [HasMVEInt] in {
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16),
@@ -2372,12 +2634,23 @@ let Predicates = [HasMVEInt] in {
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
(MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
+ def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8),
+ (MVE_VMOVLs8th MQPR:$src)>;
+ def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16),
+ (MVE_VMOVLs16th MQPR:$src)>;
+
+ // zext_inreg 8 -> 16
+ def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)),
+ (MVE_VMOVLu8bh MQPR:$src)>;
// zext_inreg 16 -> 32
def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
(MVE_VMOVLu16bh MQPR:$src)>;
- // zext_inreg 8 -> 16
- def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))),
- (MVE_VMOVLu8bh MQPR:$src)>;
+ // Same zext_inreg with vrevs, picking the top half
+ def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)),
+ (MVE_VMOVLu8th MQPR:$src)>;
+ def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (MVE_VMOVLu16th MQPR:$src)>;
}
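// For reference, assuming the standard AdvSIMD modified-immediate encoding
// (cmode in bits 11-8, imm8 in bits 7-0): the ARMvbicImm constant 0xAFF
// decodes to a per-i16-lane mask of 0xFF00, so the VBIC clears the top byte
// of each halfword (zext_inreg i8 -> i16), while the ARMvmovImm constant
// 0xCFF decodes to a per-i32-lane mask of 0x0000FFFF, so the AND keeps the
// low halfword of each word (zext_inreg i16 -> i32). The ARMvrev16/ARMvrev32
// variants match the same masking after adjacent lanes have been swapped,
// i.e. the top-half forms MVE_VMOVLu8th/MVE_VMOVLu16th.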
@@ -2395,6 +2668,8 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
// For the MVE_VSHLL_patterns multiclass to refer to
Operand immediateType = immtype;
+
+ let doubleWidthResult = 1;
}
// The immediate VSHLL instructions accept shift counts from 1 up to
@@ -2438,6 +2713,7 @@ class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size,
let Inst{11-6} = 0b111000;
let Inst{4} = 0b0;
let Inst{0} = 0b1;
+ let doubleWidthResult = 1;
}
multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U,
@@ -2472,17 +2748,17 @@ multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> {
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm,
(i32 VTI.Unsigned), (i32 top),
- (VTI.Pred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm,
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
(i32 VTI.Unsigned), (i32 top),
- (VTI.Pred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen,
- (VTI.Pred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
}
@@ -2509,6 +2785,8 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
let Inst{11-6} = 0b111111;
let Inst{4} = 0b0;
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {
@@ -2550,6 +2828,8 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
let Inst{11-6} = 0b111111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
@@ -2598,6 +2878,8 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = bit_0;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
@@ -3131,41 +3413,34 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
}
-multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> {
- def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>;
- def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>;
- def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>;
- def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>;
- def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>;
- def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>;
-}
+multiclass MVE_VRINT_m<MVEVectorVTInfo VTI, string suffix, bits<3> opcode,
+ SDNode unpred_op> {
+ def "": MVE_VRINT<suffix, opcode, VTI.Suffix, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_vrint"#suffix#"_predicated");
-defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>;
-defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>;
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
+}
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (frint (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>;
+multiclass MVE_VRINT_ops<MVEVectorVTInfo VTI> {
+ defm N : MVE_VRINT_m<VTI, "n", 0b000, int_arm_mve_vrintn>;
+ defm X : MVE_VRINT_m<VTI, "x", 0b001, frint>;
+ defm A : MVE_VRINT_m<VTI, "a", 0b010, fround>;
+ defm Z : MVE_VRINT_m<VTI, "z", 0b011, ftrunc>;
+ defm M : MVE_VRINT_m<VTI, "m", 0b101, ffloor>;
+ defm P : MVE_VRINT_m<VTI, "p", 0b111, fceil>;
}
+defm MVE_VRINTf16 : MVE_VRINT_ops<MVE_v8f16>;
+defm MVE_VRINTf32 : MVE_VRINT_ops<MVE_v4f32>;
+
class MVEFloatArithNeon<string iname, string suffix, bit size,
dag oops, dag iops, string ops,
vpred_ops vpred, string cstr, list<dag> pattern=[]>
@@ -3281,29 +3556,40 @@ class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
let Inst{8} = bit_8;
let Inst{7} = Qn{3};
let Inst{4} = bit_4;
+ let validForTailPredication = 1;
}
-def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-
-def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
+ def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b1, 0b0, fms,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = int_arm_mve_fma_predicated;
+ defvar m1 = (VTI.Vec MQPR:$m1);
+ defvar m2 = (VTI.Vec MQPR:$m2);
+ defvar add = (VTI.Vec MQPR:$add);
+ defvar pred = (VTI.Pred VCCR:$pred);
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
- (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
- def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
- (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
- def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
- (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
- def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
- (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
+ let Predicates = [HasMVEFloat] in {
+ if fms then {
+ def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (fma m1, (fneg m2), add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ } else {
+ def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ }
+ }
}
+defm MVE_VFMAf32 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v4f32>;
+defm MVE_VFMAf16 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v8f16>;
+defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>;
+defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;
+
multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
@@ -3423,10 +3709,10 @@ defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>;
defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
- Operand imm_operand_type, list<dag> pattern=[]>
+ Operand imm_operand_type>
: MVE_float<"vcvt", suffix,
(outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6),
- "$Qd, $Qm, $imm6", vpred_r, "", pattern> {
+ "$Qd, $Qm, $imm6", vpred_r, "", []> {
bits<4> Qd;
bits<6> imm6;
@@ -3468,14 +3754,43 @@ class MVE_VCVT_fix_f16<string suffix, bit U, bit op>
let Inst{20} = 0b1;
}
-def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>;
-def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>;
-def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>;
-def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>;
-def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>;
-def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>;
-def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>;
-def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>;
+multiclass MVE_VCVT_fix_patterns<Instruction Inst, bit U, MVEVectorVTInfo DestVTI,
+ MVEVectorVTInfo SrcVTI> {
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix
+ (i32 U), (SrcVTI.Vec MQPR:$Qm), imm:$scale)),
+ (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale))>;
+ def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix_predicated (i32 U),
+ (DestVTI.Vec MQPR:$inactive),
+ (SrcVTI.Vec MQPR:$Qm),
+ imm:$scale,
+ (DestVTI.Pred VCCR:$mask))),
+ (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale,
+ ARMVCCThen, (DestVTI.Pred VCCR:$mask),
+ (DestVTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VCVT_fix_f32_m<bit U, bit op,
+ MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> {
+ def "" : MVE_VCVT_fix_f32<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>;
+ defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>;
+}
+
+multiclass MVE_VCVT_fix_f16_m<bit U, bit op,
+ MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> {
+ def "" : MVE_VCVT_fix_f16<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>;
+ defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>;
+}
+
+defm MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16_m<0b0, 0b0, MVE_v8f16, MVE_v8s16>;
+defm MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16_m<0b0, 0b1, MVE_v8s16, MVE_v8f16>;
+defm MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16_m<0b1, 0b0, MVE_v8f16, MVE_v8u16>;
+defm MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16_m<0b1, 0b1, MVE_v8u16, MVE_v8f16>;
+defm MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32_m<0b0, 0b0, MVE_v4f32, MVE_v4s32>;
+defm MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32_m<0b0, 0b1, MVE_v4s32, MVE_v4f32>;
+defm MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32_m<0b1, 0b0, MVE_v4f32, MVE_v4u32>;
+defm MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32_m<0b1, 0b1, MVE_v4u32, MVE_v4f32>;
class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
bits<2> rm, list<dag> pattern=[]>
@@ -3497,23 +3812,44 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
let validForTailPredication = 1;
}
-multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
- list<dag> pattern=[]> {
- def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>;
- def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>;
- def p : MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>;
- def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>;
+multiclass MVE_VCVT_fp_int_anpm_inner<MVEVectorVTInfo Int, MVEVectorVTInfo Flt,
+ string anpm, bits<2> rm> {
+ def "": MVE_VCVT_fp_int_anpm<Int.Suffix # "." # Flt.Suffix, Int.Size,
+ Int.Unsigned, anpm, rm>;
+
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar IntrBaseName = "int_arm_mve_vcvt" # anpm;
+ defvar UnpredIntr = !cast<Intrinsic>(IntrBaseName);
+ defvar PredIntr = !cast<Intrinsic>(IntrBaseName # "_predicated");
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(Int.Vec (UnpredIntr (i32 Int.Unsigned), (Flt.Vec MQPR:$in))),
+ (Int.Vec (Inst (Flt.Vec MQPR:$in)))>;
+
+ def : Pat<(Int.Vec (PredIntr (i32 Int.Unsigned), (Int.Vec MQPR:$inactive),
+ (Flt.Vec MQPR:$in), (Flt.Pred VCCR:$pred))),
+ (Int.Vec (Inst (Flt.Vec MQPR:$in), ARMVCCThen,
+ (Flt.Pred VCCR:$pred), (Int.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VCVT_fp_int_anpm_outer<MVEVectorVTInfo Int,
+ MVEVectorVTInfo Flt> {
+ defm a : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "a", 0b00>;
+ defm n : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "n", 0b01>;
+ defm p : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "p", 0b10>;
+ defm m : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "m", 0b11>;
}
// This defines instructions such as MVE_VCVTu16f16a, with an explicit
// rounding-mode suffix on the mnemonic. The class below will define
// the bare MVE_VCVTu16f16 (with implied rounding toward zero).
-defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>;
-defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>;
-defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>;
-defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>;
+defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8s16, MVE_v8f16>;
+defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8u16, MVE_v8f16>;
+defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4s32, MVE_v4f32>;
+defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4u32, MVE_v4f32>;
-class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
+class MVE_VCVT_fp_int<string suffix, bits<2> size, bit toint, bit unsigned,
list<dag> pattern=[]>
: MVE_float<"vcvt", suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
@@ -3527,41 +3863,43 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
let Inst{17-16} = 0b11;
let Inst{15-13} = Qd{2-0};
let Inst{12-9} = 0b0011;
- let Inst{8-7} = op;
+ let Inst{8} = toint;
+ let Inst{7} = unsigned;
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
+multiclass MVE_VCVT_fp_int_m<MVEVectorVTInfo Dest, MVEVectorVTInfo Src,
+ SDNode unpred_op> {
+ defvar Unsigned = !or(!eq(Dest.SuffixLetter,"u"), !eq(Src.SuffixLetter,"u"));
+ defvar ToInt = !eq(Src.SuffixLetter,"f");
+
+ def "" : MVE_VCVT_fp_int<Dest.Suffix # "." # Src.Suffix, Dest.Size,
+ ToInt, Unsigned>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(Dest.Vec (unpred_op (Src.Vec MQPR:$src))),
+ (Dest.Vec (Inst (Src.Vec MQPR:$src)))>;
+ def : Pat<(Dest.Vec (int_arm_mve_vcvt_fp_int_predicated
+ (Src.Vec MQPR:$src), (i32 Unsigned),
+ (Src.Pred VCCR:$mask), (Dest.Vec MQPR:$inactive))),
+ (Dest.Vec (Inst (Src.Vec MQPR:$src), ARMVCCThen,
+ (Src.Pred VCCR:$mask),
+ (Dest.Vec MQPR:$inactive)))>;
+ }
+}
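// Reading the multiclass above: the encoding bits are derived from the type
// suffixes, so Unsigned is set when either suffix letter is "u" and ToInt when
// the source suffix letter is "f" (the float-to-integer direction); each defm
// below then only needs to name the destination/source types and the ISD node.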
// The unsuffixed VCVT for float->int implicitly rounds toward zero,
// which I reflect here in the llvm instruction names
-def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>;
-def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>;
-def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>;
-def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>;
+defm MVE_VCVTs16f16z : MVE_VCVT_fp_int_m<MVE_v8s16, MVE_v8f16, fp_to_sint>;
+defm MVE_VCVTu16f16z : MVE_VCVT_fp_int_m<MVE_v8u16, MVE_v8f16, fp_to_uint>;
+defm MVE_VCVTs32f32z : MVE_VCVT_fp_int_m<MVE_v4s32, MVE_v4f32, fp_to_sint>;
+defm MVE_VCVTu32f32z : MVE_VCVT_fp_int_m<MVE_v4u32, MVE_v4f32, fp_to_uint>;
// Whereas VCVT for int->float rounds to nearest
-def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>;
-def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>;
-def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>;
-def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>;
-
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))),
- (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>;
- def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))),
- (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>;
- def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))),
- (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>;
- def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))),
- (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>;
- def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))),
- (v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>;
- def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))),
- (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>;
- def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))),
- (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>;
- def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))),
- (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>;
-}
+defm MVE_VCVTf16s16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8s16, sint_to_fp>;
+defm MVE_VCVTf16u16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8u16, uint_to_fp>;
+defm MVE_VCVTf32s32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4s32, sint_to_fp>;
+defm MVE_VCVTf32u32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4u32, uint_to_fp>;
class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
@@ -3582,26 +3920,29 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
let validForTailPredication = 1;
}
-def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
-def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>;
-
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v8f16 (fabs MQPR:$src)),
- (MVE_VABSf16 MQPR:$src)>;
- def : Pat<(v4f32 (fabs MQPR:$src)),
- (MVE_VABSf32 MQPR:$src)>;
-}
+multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int,
+ MVEVectorVTInfo VTI, bit opcode> {
+ def "" : MVE_VABSNEG_fp<iname, VTI.Suffix, VTI.Size, opcode>;
+ defvar Inst = !cast<Instruction>(NAME);
-def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>;
-def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>;
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v8f16 (fneg MQPR:$src)),
- (MVE_VNEGf16 MQPR:$src)>;
- def : Pat<(v4f32 (fneg MQPR:$src)),
- (MVE_VNEGf32 MQPR:$src)>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
+ }
}
+defm MVE_VABSf16 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated,
+ MVE_v8f16, 0>;
+defm MVE_VABSf32 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated,
+ MVE_v4f32, 0>;
+defm MVE_VNEGf16 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated,
+ MVE_v8f16, 1>;
+defm MVE_VNEGf32 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated,
+ MVE_v4f32, 1>;
+
class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
list<dag> pattern=[]>
: MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
@@ -3623,11 +3964,37 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
let Inst{0} = 0b1;
}
-def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>;
-def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>;
+multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int,
+ bit bit_12> {
+ def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size{0}, bit_12>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated v(max|min)nma
+ def : Pat<(VTI.Vec (unpred_op (fabs (VTI.Vec MQPR:$Qd)),
+ (fabs (VTI.Vec MQPR:$Qm)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>;
+
+ // Predicated v(max|min)nma
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
+}
+
+multiclass MVE_VMAXNMA<MVEVectorVTInfo VTI, bit bit_12>
+ : MVE_VMAXMINNMA_m<"vmaxnma", VTI, fmaxnum, int_arm_mve_vmaxnma_predicated, bit_12>;
+
+defm MVE_VMAXNMAf32 : MVE_VMAXNMA<MVE_v4f32, 0b0>;
+defm MVE_VMAXNMAf16 : MVE_VMAXNMA<MVE_v8f16, 0b0>;
-def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>;
-def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>;
+multiclass MVE_VMINNMA<MVEVectorVTInfo VTI, bit bit_12>
+ : MVE_VMAXMINNMA_m<"vminnma", VTI, fminnum, int_arm_mve_vminnma_predicated, bit_12>;
+
+defm MVE_VMINNMAf32 : MVE_VMINNMA<MVE_v4f32, 0b1>;
+defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>;
// end of MVE Floating Point instructions
@@ -3796,12 +4163,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
- def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
+ def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+ def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+ def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
@@ -3810,12 +4177,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmpf_z<PatLeaf fc> {
@@ -3825,31 +4192,31 @@ multiclass unpred_vcmpf_z<PatLeaf fc> {
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))),
- (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmpf_r<int fc> {
- def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
- (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
- def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
- (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
+ def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
+ def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
- def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
- def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
+ def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+ def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}
let Predicates = [HasMVEInt] in {
@@ -3889,7 +4256,7 @@ let Predicates = [HasMVEFloat] in {
}
-// Extra "worst case" and/or/xor partterns, going into and out of GRP
+// Extra "worst case" and/or/xor patterns, going into and out of GPR
multiclass two_predops<SDPatternOperator opnode, Instruction insn> {
def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))),
(v16i1 (COPY_TO_REGCLASS
@@ -3918,7 +4285,6 @@ let Predicates = [HasMVEInt] in {
// example when moving between rGPR and VPR.P0 as part of predicate vector
// shuffles. We also sometimes need to cast between different predicate
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
-
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
let Predicates = [HasMVEInt] in {
@@ -3932,6 +4298,16 @@ let Predicates = [HasMVEInt] in {
def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
}
+
+ // Here we match the specific SDNode type 'ARMVectorRegCastImpl'
+ // rather than the more general 'ARMVectorRegCast' which would also
+ // match some bitconverts. If we use the latter in cases where the
+ // input and output types are the same, the bitconvert gets elided
+ // and we end up generating a nonsense match of nothing.
+
+ foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+ foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), (VT MQPR:$src)>;
}
// end of MVE compares
@@ -3973,11 +4349,32 @@ class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
let Inst{0} = round;
}
+multiclass MVE_VQxDMLxDH_p<string iname, bit exch, bit round, bit subtract,
+ MVEVectorVTInfo VTI> {
+ def "": MVE_VQxDMLxDH<iname, exch, round, subtract, VTI.Suffix, VTI.Size,
+ !if(!eq(VTI.LaneBits, 32), ",@earlyclobber $Qd", "")>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar ConstParams = (? (i32 exch), (i32 round), (i32 subtract));
+ defvar unpred_intr = int_arm_mve_vqdmlad;
+ defvar pred_intr = int_arm_mve_vqdmlad_predicated;
+
+ def : Pat<(VTI.Vec !con((unpred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c)), ConstParams)),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c)))>;
+ def : Pat<(VTI.Vec !con((pred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c)), ConstParams,
+ (? (VTI.Pred VCCR:$pred)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c),
+ ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+}
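// As an illustrative expansion (taking MVE_VQDMLADHs8, i.e. exch=0, round=0,
// subtract=0): !con splices the argument lists together, taking the operator
// from its first dag, so the patterns above are equivalent to writing
//   (int_arm_mve_vqdmlad $a, $b, $c, (i32 0), (i32 0), (i32 0))
// and its predicated counterpart with (VTI.Pred VCCR:$pred) appended, letting
// the three immediate flags be spelled out once in ConstParams.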
+
multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
bit round, bit subtract> {
- def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>;
- def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
- def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">;
+ defm s8 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v16s8>;
+ defm s16 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v8s16>;
+ defm s32 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v4s32>;
}
defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>;
@@ -4051,6 +4448,7 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
let Inst{7} = Qn{3};
let Inst{0} = 0b0;
let validForTailPredication = 1;
+ let doubleWidthResult = 1;
}
multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
@@ -4072,10 +4470,10 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
// Predicated multiply
def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm),
(VTI.Vec MQPR:$Qn)),
- uflag, (? (i32 Top), (VTI.Pred VCCR:$mask),
+ uflag, (? (i32 Top), (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))),
(VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
}
}
@@ -4122,6 +4520,50 @@ defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+ (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+ (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+ (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16),
+ (sext_inreg (v4i32 MQPR:$src2), v4i16)),
+ (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16),
+ (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)),
+ (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8),
+ (sext_inreg (v8i16 MQPR:$src2), v8i8)),
+ (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8),
+ (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)),
+ (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+ (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+ (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+ (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))),
+ (MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF))))),
+ (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)),
+ (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))),
+ (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)),
+ (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))),
+ (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>;
+}
+
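The block of patterns above lets ISel form VMULLB/VMULLT directly from a plain mul whose operands use the usual widening idioms (sext_inreg, or and/bic with a per-lane mask keeping only the low half), in addition to the ARMvmulls/ARMvmullu nodes. As a rough scalar model (hypothetical names, not ACLE intrinsics): VMULLB multiplies the even-numbered source lanes and VMULLT the odd-numbered ones, widening each product.

#include <stdint.h>

/* Illustrative scalar model of MVE vmullb/vmullt on s16 lanes
   (names made up for this sketch, not ACLE intrinsics).
   top = 0 -> bottom (even) lanes, top = 1 -> top (odd) lanes. */
static void vmull_s16_model(int32_t dst[4], const int16_t a[8],
                            const int16_t b[8], int top) {
    for (int i = 0; i < 4; i++) {
        int idx = 2 * i + top;                 /* which narrow lane feeds dst[i] */
        dst[i] = (int32_t)a[idx] * (int32_t)b[idx];
    }
}

The ARMvrev32/ARMvrev16 wrappers in the top-lane patterns rotate the odd lanes into the even positions so the same extension idiom can be reused for the T form.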
class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
@@ -4195,6 +4637,8 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
let Inst{8} = 0b0;
let Inst{7} = !if(!eq(bit_17, 0), 1, 0);
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
multiclass MVE_VxMOVxN_halves<string iname, string suffix,
@@ -4213,21 +4657,121 @@ defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>;
+
+multiclass MVE_VMOVN_p<Instruction Inst, bit top,
+ MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> {
+ // Match the most obvious MVEvmovn(a,b,t), which overwrites the odd or even
+ // lanes of a (depending on t) with the even lanes of b.
+ def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qd_src),
+ (VTI.Vec MQPR:$Qm), (i32 top))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>;
+
+ if !eq(top, 0) then {
+ // If we see MVEvmovn(a,ARMvrev(b),1), that wants to overwrite the odd
+ // lanes of a with the odd lanes of b. In other words, the lanes we're
+ // _keeping_ from a are the even ones. So we can flip it round and say that
+ // this is the same as overwriting the even lanes of b with the even lanes
+ // of a, i.e. it's a VMOVNB with the operands reversed.
+ defvar vrev = !cast<SDNode>("ARMvrev" # InVTI.LaneBits);
+ def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (vrev MQPR:$Qd_src)), (i32 1))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>;
+ }
+
+ // Match the IR intrinsic for a predicated VMOVN. This regards the Qm input
+ // as having wider lanes that we're narrowing, instead of already-narrow
+ // lanes that we're taking every other one of.
+ def : Pat<(VTI.Vec (int_arm_mve_vmovn_predicated (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm), (i32 top),
+ (InVTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ ARMVCCThen, (InVTI.Pred VCCR:$pred)))>;
+}
+
+defm : MVE_VMOVN_p<MVE_VMOVNi32bh, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VMOVN_p<MVE_VMOVNi32th, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VMOVN_p<MVE_VMOVNi16bh, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VMOVN_p<MVE_VMOVNi16th, 1, MVE_v16i8, MVE_v8i16>;
+
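A rough scalar model of the lane bookkeeping described in the MVE_VMOVN_p comment above (illustrative only, not ACLE intrinsics): the MVEvmovn node overwrites the odd or even lanes of its first operand with the even lanes of its second, keeping the rest.

#include <stdint.h>

/* Scalar model of the MVEvmovn node used above (illustrative only):
   overwrite the odd (top = 1) or even (top = 0) lanes of d with the
   even lanes of m, keeping the other half of d. */
static void vmovn_model(int8_t d[16], const int8_t m[16], int top) {
    for (int i = 0; i < 16; i += 2)
        d[i + top] = m[i];
}

/* With this model, vmovn_model(a, rev(b), 1) keeps a's even lanes and
   copies b's odd lanes into a's odd lanes - the same value as
   vmovn_model(b, a, 0), i.e. VMOVNB with the operands swapped, which is
   exactly the rewrite the second pattern in MVE_VMOVN_p performs. */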
+multiclass MVE_VQMOVN_p<Instruction Inst, bit outU, bit inU, bit top,
+ MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> {
+ def : Pat<(VTI.Vec (int_arm_mve_vqmovn (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ (i32 outU), (i32 inU), (i32 top))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vqmovn_predicated (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ (i32 outU), (i32 inU), (i32 top),
+ (InVTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ ARMVCCThen, (InVTI.Pred VCCR:$pred)))>;
+}
+
+defm : MVE_VQMOVN_p<MVE_VQMOVNs32bh, 0, 0, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNs32th, 0, 0, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNs16bh, 0, 0, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNs16th, 0, 0, 1, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu32bh, 1, 1, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu32th, 1, 1, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu16bh, 1, 1, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu16th, 1, 1, 1, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs32bh, 1, 0, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs32th, 1, 0, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs16bh, 1, 0, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs16th, 1, 0, 1, MVE_v16i8, MVE_v8i16>;
+
+def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVec<2>, SDTCisVT<3, i32>]>;
+def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>;
+def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>;
+
let Predicates = [HasMVEInt] in {
- def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
- (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
- def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
- (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
- def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))),
- (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
- def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
- (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
+ (v8i16 (MVE_VQMOVNs32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
+ (v8i16 (MVE_VQMOVNs32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+ (v16i8 (MVE_VQMOVNs16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+ (v16i8 (MVE_VQMOVNs16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
+ (v8i16 (MVE_VQMOVNu32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
+ (v8i16 (MVE_VQMOVNu32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+ (v16i8 (MVE_VQMOVNu16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+ (v16i8 (MVE_VQMOVNu16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))),
+ (v8i16 (MVE_VQSHRNbhs32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))),
+ (v16i8 (MVE_VQSHRNbhs16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))),
+ (v8i16 (MVE_VQSHRNths32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))),
+ (v16i8 (MVE_VQSHRNths16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
+
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))),
+ (v8i16 (MVE_VQSHRNbhu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))),
+ (v16i8 (MVE_VQSHRNbhu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))),
+ (v8i16 (MVE_VQSHRNthu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))),
+ (v16i8 (MVE_VQSHRNthu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
}
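The final group of patterns above folds a vector shift-right-by-immediate feeding MVEvqmovns/MVEvqmovnu into a single saturating narrowing shift, so no separate VSHR is emitted. A rough scalar model of the signed bottom-half case (illustrative only; assumes arithmetic right shift of negatives, as on Arm compilers):

#include <stdint.h>

/* Scalar model of vqshrnb.s32 (illustrative, not an ACLE intrinsic):
   shift each 32-bit lane right by imm, saturate to 16 bits, and write
   the result into the even 16-bit lanes of d, keeping the odd lanes. */
static int16_t sat_s16(int32_t x) {
    if (x > INT16_MAX) return INT16_MAX;
    if (x < INT16_MIN) return INT16_MIN;
    return (int16_t)x;
}

static void vqshrnb_s32_model(int16_t d[8], const int32_t m[4], int imm) {
    for (int i = 0; i < 4; i++)
        d[2 * i] = sat_s16(m[i] >> imm);       /* even lanes only */
}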
class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
- list<dag> pattern=[]>
- : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
- "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> {
+ dag iops_extra, vpred_ops vpred, string cstr>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ !con(iops_extra, (ins MQPR:$Qm)), "$Qd, $Qm",
+ vpred, cstr, []> {
let Inst{28} = op;
let Inst{21-16} = 0b111111;
let Inst{12} = T;
@@ -4235,10 +4779,17 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
let Inst{0} = 0b1;
let Predicates = [HasMVEFloat];
+ let retainsPreviousHalfElement = 1;
}
+def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>;
+def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>;
+def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>;
+
multiclass MVE_VCVT_f2h_m<string iname, int half> {
- def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
+ def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -4250,11 +4801,28 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> {
(v4i1 VCCR:$mask))),
(v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
ARMVCCThen, (v4i1 VCCR:$mask)))>;
+
+ def : Pat<(v8f16 (MVEvcvtn (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
+ (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
}
}
multiclass MVE_VCVT_h2f_m<string iname, int half> {
- def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>;
+ def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))),
+ (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
+ def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated
+ (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
+ (v4i1 VCCR:$mask))),
+ (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
+
+ def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))),
+ (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
+ }
}
defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
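For reference, the vcvtb/vcvtt f16<->f32 forms convert four lanes and touch only the even (bottom) or odd (top) f16 half of the 128-bit register, which is why the narrowing direction ties $Qd to $Qd_src and the class sets retainsPreviousHalfElement. A rough scalar model of the narrowing direction (illustrative only; assumes a compiler providing _Float16, e.g. recent Clang targeting Arm):

/* Scalar model of MVE vcvtb/vcvtt.f16.f32 (illustrative only).
   top selects whether the even (0) or odd (1) f16 lanes are written;
   the other half of d is retained. */
static void vcvt_f16_f32_model(_Float16 d[8], const float m[4], int top) {
    for (int i = 0; i < 4; i++)
        d[2 * i + top] = (_Float16)m[i];
}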
@@ -4353,15 +4921,37 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
let Inst{7} = Qn{3};
let Inst{0} = 0b1;
let validForTailPredication = 1;
+ let doubleWidthResult = 1;
+}
+
+multiclass MVE_VQDMULL_m<string iname, MVEVectorVTInfo VTI, bit size, bit T,
+ string cstr> {
+ def "" : MVE_VQDMULL<iname, VTI.Suffix, size, T, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn), (i32 T))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ // Predicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated
+ (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 T), (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+ }
}
-multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
- def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>;
- def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>;
+multiclass MVE_VQDMULL_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+ defm bh : MVE_VQDMULL_m<"vqdmullb", VTI, size, 0b0, cstr>;
+ defm th : MVE_VQDMULL_m<"vqdmullt", VTI, size, 0b1, cstr>;
}
-defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
-defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">;
+defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<MVE_v8s16, 0b0>;
+defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
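VQDMULLB/VQDMULLT double each widened product of the even or odd lanes and saturate it to the wider element type; the s32 forms additionally mark $Qd earlyclobber so the destination never overlaps a source register. A rough scalar model of the s16 case (illustrative only):

#include <stdint.h>

/* Scalar model of vqdmullb.s16 / vqdmullt.s16 (illustrative only):
   2 * a[lane] * b[lane], saturated to 32 bits, taken from the even
   (top = 0) or odd (top = 1) 16-bit lanes. Only -32768 * -32768,
   once doubled, can exceed INT32_MAX. */
static void vqdmull_s16_model(int32_t d[4], const int16_t a[8],
                              const int16_t b[8], int top) {
    for (int i = 0; i < 4; i++) {
        int idx = 2 * i + top;
        int64_t p = 2 * (int64_t)a[idx] * (int64_t)b[idx];
        d[i] = p > INT32_MAX ? INT32_MAX : (int32_t)p;
    }
}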
// end of mve_qDest_qSrc
@@ -4407,10 +4997,61 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
let Inst{3-0} = Rm{3-0};
}
+// Patterns for vector-scalar instructions with integer operands
+multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI,
+ SDNode unpred_op, SDNode pred_op,
+ bit unpred_has_sign = 0,
+ bit pred_has_sign = 0> {
+ defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?));
+ defvar PredSign = !if(pred_has_sign, (? (i32 VTI.Unsigned)), (?));
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated version
+ def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ UnpredSign)),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated version
+ def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ PredSign,
+ (pred_op (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+// Patterns for vector-scalar instructions with FP operands
+multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int,
+ Instruction instr_f16,
+ Instruction instr_f32> {
+ let Predicates = [HasMVEFloat] in {
+ // Unpredicated F16
+ def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Unpredicated F32
+ def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated F16
+ def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)),
+ (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (v8i1 VCCR:$mask),
+ (v8f16 MQPR:$inactive)))>;
+ // Predicated F32
+ def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)),
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (v4i1 VCCR:$mask),
+ (v4f32 MQPR:$inactive)))>;
+ }
+}
+
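Both helper multiclasses above recognise the "vector op splatted scalar" form, and for the predicated intrinsics they also thread through the mask and the inactive-lane vector. A rough scalar model of what the selected predicated instruction computes (illustrative only):

#include <stdint.h>

/* Scalar model of a predicated vector-scalar op, e.g. vaddt.i32 q0,q1,r0
   (illustrative only): active lanes get q[i] op r, inactive lanes take
   the corresponding lane of the "inactive" vector. */
static void vadd_qr_pred_model(int32_t d[4], const int32_t q[4], int32_t r,
                               const uint8_t active[4],
                               const int32_t inactive[4]) {
    for (int i = 0; i < 4; i++)
        d[i] = active[i] ? q[i] + r : inactive[i];
}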
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+ bit bit_5, bit bit_12, bit bit_16, bit bit_28>
+ : MVE_qDest_rSrc<iname, suffix, ""> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
@@ -4421,42 +5062,60 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list<dag> pattern=[]> {
- def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00,
- bit_5, bit_12, bit_16, bit_28>;
- def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01,
- bit_5, bit_12, bit_16, bit_28>;
- def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10,
- bit_5, bit_12, bit_16, bit_28>;
-}
-
-defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>;
-defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>;
-defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>;
-
-defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
-defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
-defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
+// Vector-scalar add/sub
+multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ unpred_op, pred_int>;
+}
+
+multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
+
+multiclass MVE_VSUB_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;
+
+defm MVE_VADD_qr_i8 : MVE_VADD_qr_m<MVE_v16i8>;
+defm MVE_VADD_qr_i16 : MVE_VADD_qr_m<MVE_v8i16>;
+defm MVE_VADD_qr_i32 : MVE_VADD_qr_m<MVE_v4i32>;
+
+defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m<MVE_v16i8>;
+defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m<MVE_v8i16>;
+defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>;
+
+// Vector-scalar saturating add/sub
+multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op_s, SDNode unpred_op_u,
+ Intrinsic pred_int> {
+ def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract,
+ 0b0, VTI.Unsigned>;
+ defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s);
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ unpred_op, pred_int, 0, 1>;
+}
+
+multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat,
+ int_arm_mve_qadd_predicated>;
+
+multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat,
+ int_arm_mve_qsub_predicated>;
+
+defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8>;
+defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16>;
+defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32>;
+defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8>;
+defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16>;
+defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32>;
+
+defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8>;
+defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16>;
+defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32>;
+defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8>;
+defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16>;
+defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32>;
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
bit T, string cstr="", list<dag> pattern=[]>
@@ -4469,15 +5128,40 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size,
let Inst{8} = 0b1;
let Inst{5} = 0b1;
let validForTailPredication = 1;
+ let doubleWidthResult = 1;
}
-multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
- def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>;
- def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>;
+multiclass MVE_VQDMULL_qr_m<string iname, MVEVectorVTInfo VTI, bit size,
+ bit T, string cstr> {
+ def "" : MVE_VQDMULL_qr<iname, VTI.Suffix, size, T, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val)),
+ (i32 T))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated
+ (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val)),
+ (i32 T),
+ (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+ }
}
-defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
-defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">;
+multiclass MVE_VQDMULL_qr_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+ defm bh : MVE_VQDMULL_qr_m<"vqdmullb", VTI, size, 0b0, cstr>;
+ defm th : MVE_VQDMULL_qr_m<"vqdmullt", VTI, size, 0b1, cstr>;
+}
+
+defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<MVE_v8s16, 0b0>;
+defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
class MVE_VxADDSUB_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit subtract,
@@ -4493,19 +5177,34 @@ class MVE_VxADDSUB_qr<string iname, string suffix,
let validForTailPredication = 1;
}
-def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
-def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>;
-def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>;
-def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>;
-def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>;
-def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>;
+multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ Intrinsic unpred_int, Intrinsic pred_int> {
+ def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME),
+ VTI, unpred_int, pred_int, 1, 1>;
+}
+
+multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd,
+ int_arm_mve_hadd_predicated>;
+
+multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub,
+ int_arm_mve_hsub_predicated>;
-def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>;
-def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>;
-def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>;
-def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>;
-def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>;
-def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>;
+defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>;
+defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>;
+defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>;
+defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>;
+defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>;
+defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>;
+
+defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>;
+defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>;
+defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>;
+defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>;
+defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>;
+defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>;
let Predicates = [HasMVEFloat] in {
def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>;
@@ -4515,6 +5214,11 @@ let Predicates = [HasMVEFloat] in {
def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>;
}
+defm : MVE_vec_scalar_fp_pat_m<fadd, int_arm_mve_add_predicated,
+ MVE_VADD_qr_f16, MVE_VADD_qr_f32>;
+defm : MVE_vec_scalar_fp_pat_m<fsub, int_arm_mve_sub_predicated,
+ MVE_VSUB_qr_f16, MVE_VSUB_qr_f32>;
+
class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_7, bit bit_17, list<dag> pattern=[]>
: MVE_qDest_single_rSrc<iname, suffix, pattern> {
@@ -4563,19 +5267,19 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>;
defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>;
let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
- (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
- (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
- (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
+ (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
+ (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
+ (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
- def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
- (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
- (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
- (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
+ (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
+ (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
+ (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
}
class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
@@ -4594,6 +5298,20 @@ def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>;
def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>;
def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>;
+multiclass MVE_VBRSR_pat_m<MVEVectorVTInfo VTI, Instruction Inst> {
+ // Unpredicated
+ def : Pat<(VTI.Vec (int_arm_mve_vbrsr (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm)))>;
+ // Predicated
+ def : Pat<(VTI.Vec (int_arm_mve_vbrsr_predicated
+ (VTI.Vec MQPR:$inactive),
+ (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+}
+
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))),
(v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>;
@@ -4603,11 +5321,19 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))),
(v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>;
+
+ defm : MVE_VBRSR_pat_m<MVE_v16i8, MVE_VBRSR8>;
+ defm : MVE_VBRSR_pat_m<MVE_v8i16, MVE_VBRSR16>;
+ defm : MVE_VBRSR_pat_m<MVE_v4i32, MVE_VBRSR32>;
}
-class MVE_VMUL_qr_int<string iname, string suffix,
- bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+let Predicates = [HasMVEFloat] in {
+ defm : MVE_VBRSR_pat_m<MVE_v8f16, MVE_VBRSR16>;
+ defm : MVE_VBRSR_pat_m<MVE_v4f32, MVE_VBRSR32>;
+}
+
+class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size>
+ : MVE_qDest_rSrc<iname, suffix, ""> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
@@ -4618,19 +5344,16 @@ class MVE_VMUL_qr_int<string iname, string suffix,
let validForTailPredication = 1;
}
-def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
-def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
-def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ mul, int_arm_mve_mul_predicated>;
}
+defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>;
+defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m<MVE_v8i16>;
+defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m<MVE_v4i32>;
+
class MVE_VxxMUL_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
@@ -4643,19 +5366,37 @@ class MVE_VxxMUL_qr<string iname, string suffix,
let Inst{5} = 0b1;
}
-def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>;
-def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>;
-def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>;
+multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28,
+ Intrinsic int_unpred, Intrinsic int_pred> {
+ def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ int_unpred, int_pred>;
+}
+
+multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0,
+ int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>;
+
+multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1,
+ int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>;
-def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
-def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
-def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
+defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>;
+defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m<MVE_v8s16>;
+defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m<MVE_v4s32>;
+
+defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>;
+defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>;
+defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>;
let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
}
+defm : MVE_vec_scalar_fp_pat_m<fmul, int_arm_mve_mul_predicated,
+ MVE_VMUL_qr_f16, MVE_VMUL_qr_f32>;
+
class MVE_VFMAMLA_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit S,
list<dag> pattern=[]>
@@ -4668,42 +5409,87 @@ class MVE_VFMAMLA_qr<string iname, string suffix,
let Inst{8} = 0b0;
let Inst{5} = 0b0;
let validForTailPredication = 1;
+ let hasSideEffects = 0;
}
-def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;
-def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>;
-def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>;
-def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>;
-def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>;
-def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>;
+multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI,
+ bit scalar_addend> {
+ def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
+ scalar_addend>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_n_predicated");
+ defvar v1 = (VTI.Vec MQPR:$v1);
+ defvar v2 = (VTI.Vec MQPR:$v2);
+ defvar vs = (VTI.Vec (ARMvdup rGPR:$s));
+ defvar s = (i32 rGPR:$s);
+ defvar pred = (VTI.Pred VCCR:$pred);
+
+ // The signed and unsigned variants of this instruction have different
+ // encodings, but they're functionally identical. For the sake of
+ // determinism, we generate only the unsigned variant.
+ if VTI.Unsigned then let Predicates = [HasMVEInt] in {
+ if scalar_addend then {
+ def : Pat<(VTI.Vec (add (mul v1, v2), vs)),
+ (VTI.Vec (Inst v1, v2, s))>;
+ } else {
+ def : Pat<(VTI.Vec (add (mul v2, vs), v1)),
+ (VTI.Vec (Inst v1, v2, s))>;
+ }
-def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>;
-def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>;
-def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>;
-def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;
-def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
-def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;
+ def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)),
+ (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>;
+ }
+}
-let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (add (v4i32 MQPR:$src1),
- (v4i32 (mul (v4i32 MQPR:$src2),
- (v4i32 (ARMvdup (i32 rGPR:$x))))))),
- (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$src1),
- (v8i16 (mul (v8i16 MQPR:$src2),
- (v8i16 (ARMvdup (i32 rGPR:$x))))))),
- (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>;
- def : Pat<(v16i8 (add (v16i8 MQPR:$src1),
- (v16i8 (mul (v16i8 MQPR:$src2),
- (v16i8 (ARMvdup (i32 rGPR:$x))))))),
- (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
+defm MVE_VMLA_qr_s8 : MVE_VMLA_qr_multi<"vmla", MVE_v16s8, 0b0>;
+defm MVE_VMLA_qr_s16 : MVE_VMLA_qr_multi<"vmla", MVE_v8s16, 0b0>;
+defm MVE_VMLA_qr_s32 : MVE_VMLA_qr_multi<"vmla", MVE_v4s32, 0b0>;
+defm MVE_VMLA_qr_u8 : MVE_VMLA_qr_multi<"vmla", MVE_v16u8, 0b0>;
+defm MVE_VMLA_qr_u16 : MVE_VMLA_qr_multi<"vmla", MVE_v8u16, 0b0>;
+defm MVE_VMLA_qr_u32 : MVE_VMLA_qr_multi<"vmla", MVE_v4u32, 0b0>;
+
+defm MVE_VMLAS_qr_s8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16s8, 0b1>;
+defm MVE_VMLAS_qr_s16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8s16, 0b1>;
+defm MVE_VMLAS_qr_s32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4s32, 0b1>;
+defm MVE_VMLAS_qr_u8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16u8, 0b1>;
+defm MVE_VMLAS_qr_u16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8u16, 0b1>;
+defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>;
+
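The scalar_addend bit selects between VMLA (vector accumulator, scalar multiplicand) and VMLAS (vector product, scalar addend), which is why the two unpredicated patterns above place the splatted scalar differently; and since the signed and unsigned encodings compute the same value, only the unsigned variants receive patterns. Roughly (illustrative only):

#include <stdint.h>

/* Scalar models of vmla.u32 qda,qn,rm and vmlas.u32 qda,qn,rm
   (illustrative only; the signed forms compute the same result). */
static void vmla_model(uint32_t qda[4], const uint32_t qn[4], uint32_t rm) {
    for (int i = 0; i < 4; i++)
        qda[i] = qda[i] + qn[i] * rm;          /* VMLA: qda += qn * rm */
}

static void vmlas_model(uint32_t qda[4], const uint32_t qn[4], uint32_t rm) {
    for (int i = 0; i < 4; i++)
        qda[i] = qda[i] * qn[i] + rm;          /* VMLAS: qda = qda*qn + rm */
}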
+multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
+ bit scalar_addend> {
+ def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = int_arm_mve_fma_predicated;
+ defvar v1 = (VTI.Vec MQPR:$v1);
+ defvar v2 = (VTI.Vec MQPR:$v2);
+ defvar vs = (VTI.Vec (ARMvdup (i32 rGPR:$s)));
+ defvar is = (i32 rGPR:$s);
+ defvar pred = (VTI.Pred VCCR:$pred);
+
+ let Predicates = [HasMVEFloat] in {
+ if scalar_addend then {
+ def : Pat<(VTI.Vec (fma v1, v2, vs)),
+ (VTI.Vec (Inst v1, v2, is))>;
+ def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
+ } else {
+ def : Pat<(VTI.Vec (fma v1, vs, v2)),
+ (VTI.Vec (Inst v2, v1, is))>;
+ def : Pat<(VTI.Vec (fma vs, v1, v2)),
+ (VTI.Vec (Inst v2, v1, is))>;
+ def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
+ (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+ def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
+ (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+ }
+ }
}
let Predicates = [HasMVEFloat] in {
- def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
- def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
- def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>;
- def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>;
+ defm MVE_VFMA_qr_f16 : MVE_VFMA_qr_multi<"vfma", MVE_v8f16, 0>;
+ defm MVE_VFMA_qr_f32 : MVE_VFMA_qr_multi<"vfma", MVE_v4f32, 0>;
+ defm MVE_VFMA_qr_Sf16 : MVE_VFMA_qr_multi<"vfmas", MVE_v8f16, 1>;
+ defm MVE_VFMA_qr_Sf32 : MVE_VFMA_qr_multi<"vfmas", MVE_v4f32, 1>;
}
class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
@@ -4718,10 +5504,30 @@ class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
let Inst{5} = bit_5;
}
+multiclass MVE_VQDMLAH_qr_multi<string iname, MVEVectorVTInfo VTI,
+ bit bit_5, bit bit_12> {
+ def "": MVE_VQDMLAH_qr<iname, VTI.Suffix, 0b0, VTI.Size, bit_5, bit_12>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # iname);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_predicated");
+
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s), (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s), ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+ }
+}
+
multiclass MVE_VQDMLAH_qr_types<string iname, bit bit_5, bit bit_12> {
- def s8 : MVE_VQDMLAH_qr<iname, "s8", 0b0, 0b00, bit_5, bit_12>;
- def s16 : MVE_VQDMLAH_qr<iname, "s16", 0b0, 0b01, bit_5, bit_12>;
- def s32 : MVE_VQDMLAH_qr<iname, "s32", 0b0, 0b10, bit_5, bit_12>;
+ defm s8 : MVE_VQDMLAH_qr_multi<iname, MVE_v16s8, bit_5, bit_12>;
+ defm s16 : MVE_VQDMLAH_qr_multi<iname, MVE_v8s16, bit_5, bit_12>;
+ defm s32 : MVE_VQDMLAH_qr_multi<iname, MVE_v4s32, bit_5, bit_12>;
}
defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>;
@@ -4752,6 +5558,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{6-1} = 0b110111;
let Inst{0} = imm{0};
let validForTailPredication = 1;
+ let hasSideEffects = 0;
}
def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
@@ -4787,6 +5594,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{3-1} = Rm{3-1};
let Inst{0} = imm{0};
let validForTailPredication = 1;
+ let hasSideEffects = 0;
}
def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>;
@@ -4855,6 +5663,8 @@ class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr>
let Inst{12-5} = 0b01111000;
let Inst{4} = idx2;
let Inst{3-0} = Rt{3-0};
+
+ let hasSideEffects = 0;
}
// The assembly syntax for these instructions mentions the vector
@@ -4924,6 +5734,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
let mayLoad = load;
let mayStore = !eq(load,0);
+ let hasSideEffects = 0;
}
// A parameter class used to encapsulate all the ways the writeback
@@ -5004,22 +5815,44 @@ foreach wb = [MVE_vldst24_writeback<
"vst" # n.nvecs # stage # "." # s.lanesize>;
}
+def SDTARMVST2 : SDTypeProfile<1, 5, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
+ SDTCisSameAs<3, 4>, SDTCisVT<5, i32>]>;
+def SDTARMVST4 : SDTypeProfile<1, 7, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
+ SDTCisSameAs<3, 4>, SDTCisSameAs<3, 5>,
+ SDTCisSameAs<3, 6>, SDTCisVT<7, i32>]>;
+def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain]>;
+def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain]>;
+
multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
foreach stage = [0,1] in
def : Pat<(int_arm_mve_vst2q i32:$addr,
- (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
+ (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
(!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
- (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
- t2_addr_offset_none:$addr)>;
+ (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ t2_addr_offset_none:$addr)>;
+ foreach stage = [0,1] in
+ def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32),
+ (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))),
+ (i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb)
+ (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ t2_addr_offset_none:$addr))>;
foreach stage = [0,1,2,3] in
def : Pat<(int_arm_mve_vst4q i32:$addr,
- (VT MQPR:$v0), (VT MQPR:$v1),
- (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
+ (VT MQPR:$v0), (VT MQPR:$v1),
+ (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
(!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
- (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
- VT:$v2, qsub_2, VT:$v3, qsub_3),
- t2_addr_offset_none:$addr)>;
+ (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
+ t2_addr_offset_none:$addr)>;
+ foreach stage = [0,1,2,3] in
+ def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64),
+ (VT MQPR:$v0), (VT MQPR:$v1),
+ (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))),
+ (i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb)
+ (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
+ t2_addr_offset_none:$addr))>;
}
defm : MVE_vst24_patterns<8, v16i8>;
defm : MVE_vst24_patterns<16, v8i16>;
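The MVEVST2UPD/MVEVST4UPD patterns select the writeback (_wb) form of the final store stage; the fixed (i32 32) and (i32 64) increments are simply the total bytes written by a whole vst2/vst4 sequence (two or four 16-byte Q registers). A rough scalar model of an interleaving vst2 with post-increment (illustrative only):

#include <stdint.h>

/* Scalar model of an interleaving vst2.32 {q0,q1}, [r0]! sequence
   (illustrative only): lanes of v0 and v1 are interleaved into memory
   and the base pointer is bumped by the 32 bytes stored. */
static uint32_t *vst2_u32_wb_model(uint32_t *p, const uint32_t v0[4],
                                   const uint32_t v1[4]) {
    for (int i = 0; i < 4; i++) {
        p[2 * i]     = v0[i];
        p[2 * i + 1] = v1[i];
    }
    return p + 8;                              /* 32-byte writeback */
}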
@@ -5097,6 +5930,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
let mayLoad = dir.load;
let mayStore = !eq(dir.load,0);
+ let hasSideEffects = 0;
let validForTailPredication = 1;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
index 6244d8d9e27e..1b3f6075c0e9 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -509,11 +509,6 @@ def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>;
def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>;
def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>;
-def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
-def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
-def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
-
def NEONvbsl : SDNode<"ARMISD::VBSL",
SDTypeProfile<1, 3, [SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -531,11 +526,6 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
-def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisSameAs<1, 2>]>;
-def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
-def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
-
def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
SDTCisVT<2, v8i8>]>;
def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
@@ -1084,6 +1074,12 @@ def : Pat<(vector_insert (v4f16 DPR:$src),
def : Pat<(vector_insert (v8f16 QPR:$src),
(f16 (load addrmode6:$addr)), imm:$lane),
(VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v4bf16 DPR:$src),
+ (bf16 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v8bf16 QPR:$src),
+ (bf16 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
def : Pat<(vector_insert (v2f32 DPR:$src),
(f32 (load addrmode6:$addr)), imm:$lane),
(VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
@@ -2459,57 +2455,6 @@ def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
}
//===----------------------------------------------------------------------===//
-// NEON pattern fragments
-//===----------------------------------------------------------------------===//
-
-// Extract D sub-registers of Q registers.
-def DSubReg_i8_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
- MVT::i32);
-}]>;
-def DSubReg_i16_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
- MVT::i32);
-}]>;
-def DSubReg_i32_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
- MVT::i32);
-}]>;
-def DSubReg_f64_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
- MVT::i32);
-}]>;
-
-// Extract S sub-registers of Q/D registers.
-def SSubReg_f32_reg : SDNodeXForm<imm, [{
- assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
- MVT::i32);
-}]>;
-
-// Extract S sub-registers of Q/D registers containing a given f16 lane.
-def SSubReg_f16_reg : SDNodeXForm<imm, [{
- assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
- MVT::i32);
-}]>;
-
-// Translate lane numbers from Q registers to D subregs.
-def SubReg_i8_lane : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
-}]>;
-def SubReg_i16_lane : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
-}]>;
-def SubReg_i32_lane : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
-}]>;
-
-//===----------------------------------------------------------------------===//
// Instruction Classes
//===----------------------------------------------------------------------===//
@@ -4367,7 +4312,7 @@ def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(i32 0))>;
def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhd DPR:$Rn,
- (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), (f16 HPR:$Rm), ssub_0),
(i32 0))>;
def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(VMULslfq QPR:$Rn,
@@ -4375,7 +4320,7 @@ def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(i32 0))>;
def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhq QPR:$Rn,
- (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), (f16 HPR:$Rm), ssub_0),
(i32 0))>;
}
@@ -4433,17 +4378,17 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
DecoderNamespace = "NEONData" in {
defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "s", NEONvmulls, 1>;
+ "vmull", "s", ARMvmulls, 1>;
defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "u", NEONvmullu, 1>;
+ "vmull", "u", ARMvmullu, 1>;
def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
v8i16, v8i8, int_arm_neon_vmullp, 1>;
def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
"vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
Requires<[HasV8, HasCrypto]>;
}
-defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
-defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>;
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
@@ -4513,12 +4458,12 @@ def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "s", NEONvmulls, add>;
+ "vmlal", "s", ARMvmulls, add>;
defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "u", NEONvmullu, add>;
+ "vmlal", "u", ARMvmullu, add>;
-defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
-defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", ARMvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", ARMvmullu, add>;
let Predicates = [HasNEON, HasV8_1a] in {
// v8.1a Neon Rounding Double Multiply-Op vector operations,
@@ -4746,12 +4691,12 @@ def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "s", NEONvmulls, sub>;
+ "vmlsl", "s", ARMvmulls, sub>;
defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "u", NEONvmullu, sub>;
+ "vmlsl", "u", ARMvmullu, sub>;
-defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
-defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", ARMvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", ARMvmullu, sub>;
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -4833,10 +4778,10 @@ def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
// We put them in the VFPV8 decoder namespace because the ARM and Thumb
// encodings are the same and thus no further bit twiddling is necessary
// in the disassembler.
-class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
- ValueType AccumTy, ValueType InputTy,
+class VDOT<bit op6, bit op4, bit op23, RegisterClass RegTy, string Asm,
+ string AsmTy, ValueType AccumTy, ValueType InputTy,
SDPatternOperator OpNode> :
- N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+ N3Vnp<{0b1100, op23}, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
(ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
Asm, AsmTy,
[(set (AccumTy RegTy:$dst),
@@ -4848,10 +4793,10 @@ class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
let Constraints = "$dst = $Vd";
}
-def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
-def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
-def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
-def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
+def VUDOTD : VDOT<0, 1, 0, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, 0, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
// Indexed dot product instructions:
multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
@@ -4886,6 +4831,68 @@ defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+// v8.6A matrix multiplication extension
+let Predicates = [HasMatMulInt8] in {
+ class N3VMatMul<bit B, bit U, string Asm, string AsmTy,
+ SDPatternOperator OpNode>
+ : N3Vnp<{0b1100, B}, 0b10, 0b1100, 1, U, (outs QPR:$dst),
+ (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), N3RegFrm, NoItinerary,
+ Asm, AsmTy,
+ [(set (v4i32 QPR:$dst), (OpNode (v4i32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+ }
+
+ multiclass N3VMixedDotLane<bit Q, bit U, string Asm, string AsmTy, RegisterClass RegTy,
+ ValueType AccumTy, ValueType InputTy, SDPatternOperator OpNode,
+ dag RHS> {
+
+ def "" : N3Vnp<0b11101, 0b00, 0b1101, Q, U, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), N3RegFrm,
+ NoItinerary, Asm, AsmTy, []> {
+ bit lane;
+ let Inst{5} = lane;
+ let AsmString = !strconcat(Asm, ".", AsmTy, "\t$Vd, $Vn, $Vm$lane");
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+ }
+
+ def : Pat<
+ (AccumTy (OpNode (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
+
+ }
+
+ multiclass SUDOTLane<bit Q, RegisterClass RegTy, ValueType AccumTy, ValueType InputTy, dag RHS>
+ : N3VMixedDotLane<Q, 1, "vsudot", "u8", RegTy, AccumTy, InputTy, null_frag, null_frag> {
+ def : Pat<
+ (AccumTy (int_arm_neon_usdot (AccumTy RegTy:$Vd),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))),
+ (InputTy RegTy:$Vn))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
+ }
+
+ def VSMMLA : N3VMatMul<0, 0, "vsmmla", "s8", int_arm_neon_smmla>;
+ def VUMMLA : N3VMatMul<0, 1, "vummla", "u8", int_arm_neon_ummla>;
+ def VUSMMLA : N3VMatMul<1, 0, "vusmmla", "s8", int_arm_neon_usmmla>;
+ def VUSDOTD : VDOT<0, 0, 1, DPR, "vusdot", "s8", v2i32, v8i8, int_arm_neon_usdot>;
+ def VUSDOTQ : VDOT<1, 0, 1, QPR, "vusdot", "s8", v4i32, v16i8, int_arm_neon_usdot>;
+
+ defm VUSDOTDI : N3VMixedDotLane<0, 0, "vusdot", "s8", DPR, v2i32, v8i8,
+ int_arm_neon_usdot, (v2i32 DPR_VFP2:$Vm)>;
+ defm VUSDOTQI : N3VMixedDotLane<1, 0, "vusdot", "s8", QPR, v4i32, v16i8,
+ int_arm_neon_usdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+ defm VSUDOTDI : SUDOTLane<0, DPR, v2i32, v8i8, (v2i32 DPR_VFP2:$Vm)>;
+ defm VSUDOTQI : SUDOTLane<1, QPR, v4i32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+}
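The N3VMatMul instructions implement the v8.6-A int8 matrix-multiply extension: Vn supplies a 2x8 matrix of bytes, Vm an 8x2 matrix (one column per 8 bytes), and the four i32 lanes of Vd accumulate the 2x2 product, in signed, unsigned, or mixed-signedness flavours. A rough scalar model of the signed form, assuming the same MatMulAdd semantics as the AArch64 pseudocode (illustrative only):

#include <stdint.h>

/* Scalar model of vsmmla.s8 qd, qn, qm (illustrative only): qn is a 2x8
   matrix of signed bytes, each consecutive group of 8 bytes of qm is one
   column of the 8x2 operand, and the four 32-bit lanes of qd accumulate
   the 2x2 matrix product. */
static void vsmmla_model(int32_t d[4], const int8_t n[16], const int8_t m[16]) {
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++) {
            int32_t sum = d[2 * i + j];
            for (int k = 0; k < 8; k++)
                sum += (int32_t)n[8 * i + k] * (int32_t)m[8 * j + k];
            d[2 * i + j] = sum;
        }
}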
// ARMv8.3 complex operations
class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q,
@@ -5232,7 +5239,6 @@ class VFMQ<string opc, string type, bits<2> S>
let Inst{3} = idx{0};
}
-let hasNoSchedulingInfo = 1 in {
// op1 op2 op3
def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>;
def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>;
@@ -5242,7 +5248,6 @@ def VFMALDI : VFMD<"vfmal", "f16", 0b00>;
def VFMSLDI : VFMD<"vfmsl", "f16", 0b01>;
def VFMALQI : VFMQ<"vfmal", "f16", 0b00>;
def VFMSLQI : VFMQ<"vfmsl", "f16", 0b01>;
-}
} // HasNEON, HasFP16FML
@@ -5296,7 +5301,7 @@ def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
IIC_VMOVImm,
"vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+ (v4i16 (ARMvorrImm DPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5305,7 +5310,7 @@ def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
IIC_VMOVImm,
"vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+ (v2i32 (ARMvorrImm DPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -5314,7 +5319,7 @@ def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
IIC_VMOVImm,
"vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+ (v8i16 (ARMvorrImm QPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5323,7 +5328,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
IIC_VMOVImm,
"vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+ (v4i32 (ARMvorrImm QPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -5347,7 +5352,7 @@ def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
IIC_VMOVImm,
"vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+ (v4i16 (ARMvbicImm DPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5356,7 +5361,7 @@ def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
IIC_VMOVImm,
"vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+ (v2i32 (ARMvbicImm DPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -5365,7 +5370,7 @@ def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
IIC_VMOVImm,
"vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+ (v8i16 (ARMvbicImm QPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5374,7 +5379,7 @@ def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
IIC_VMOVImm,
"vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+ (v4i32 (ARMvbicImm QPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -6354,32 +6359,57 @@ def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
(EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
}
-def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
-def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
-
-let Predicates = [HasNEON] in {
-def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane),
- (EXTRACT_SUBREG
- (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
- (SSubReg_f16_reg imm_even:$lane))>;
+multiclass ExtractEltEvenF16<ValueType VT4, ValueType VT8> {
+ def : Pat<(extractelt (VT4 DPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (VT4 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_even:$lane))>;
+ def : Pat<(extractelt (VT8 QPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG
+ (v4f32 (COPY_TO_REGCLASS (VT8 QPR:$src), QPR_VFP2)),
+ (SSubReg_f16_reg imm_even:$lane))>;
+}
-def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane),
+multiclass ExtractEltOddF16VMOVH<ValueType VT4, ValueType VT8> {
+ def : Pat<(extractelt (VT4 DPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG
- (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
- (SSubReg_f16_reg imm_odd:$lane))),
+ (v2f32 (COPY_TO_REGCLASS (VT4 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
+ def : Pat<(extractelt (VT8 QPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VMOVH (EXTRACT_SUBREG
+ (v4f32 (COPY_TO_REGCLASS (VT8 QPR:$src), QPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane))),
+ HPR)>;
+}
+
+let Predicates = [HasNEON] in {
+ defm : ExtractEltEvenF16<v4f16, v8f16>;
+ defm : ExtractEltOddF16VMOVH<v4f16, v8f16>;
+}
+
+let AddedComplexity = 1, Predicates = [HasNEON, HasBF16, HasFullFP16] in {
+ // If VMOVH (vmovx.f16) is available, use it to extract BF16 from the odd lanes
+ defm : ExtractEltOddF16VMOVH<v4bf16, v8bf16>;
+}
-def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane),
- (EXTRACT_SUBREG
- (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
- (SSubReg_f16_reg imm_even:$lane))>;
+let Predicates = [HasBF16, HasNEON] in {
+ defm : ExtractEltEvenF16<v4bf16, v8bf16>;
-def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane),
+ // Otherwise, if VMOVH is not available, resort to extracting the odd lane
+ // into a GPR and then moving to HPR
+ def : Pat<(extractelt (v4bf16 DPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
- (VMOVH (EXTRACT_SUBREG
- (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
- (SSubReg_f16_reg imm_odd:$lane))),
+ (VGETLNu16 (v4bf16 DPR:$src), imm:$lane),
+ HPR)>;
+
+ def : Pat<(extractelt (v8bf16 QPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)),
HPR)>;
}
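Editor's note, not part of the commit: a minimal usage-level sketch of what the even/odd lane-extract patterns above are selected from. The intrinsic names and the -march string are assumptions taken from the ACLE FP16/BF16 extensions; check them against your arm_neon.h.

// Assumes e.g. -march=armv8.6-a+bf16+fp16+simd (flags are illustrative).
#include <arm_neon.h>

float16_t  f16_odd_lane(float16x8_t v)   { return vgetq_lane_f16(v, 5); }
bfloat16_t bf16_odd_lane(bfloat16x8_t v) { return vgetq_lane_bf16(v, 5); }
// For odd lanes the patterns prefer VMOVH (vmovx.f16) when FullFP16 is
// present; without it the bf16 extract falls back to VMOV.U16 into a core
// register and a copy back to an HPR, as the comments above describe.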
@@ -6415,6 +6445,21 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
}
}
+// TODO: for odd lanes we could optimize this a bit by using the VINS
+// FullFP16 instruction when it is available
+multiclass InsertEltF16<ValueType VTScalar, ValueType VT4, ValueType VT8> {
+ def : Pat<(insertelt (VT4 DPR:$src1), (VTScalar HPR:$src2), imm:$lane),
+ (VT4 (VSETLNi16 DPR:$src1,
+ (COPY_TO_REGCLASS HPR:$src2, GPR), imm:$lane))>;
+ def : Pat<(insertelt (VT8 QPR:$src1), (VTScalar HPR:$src2), imm:$lane),
+ (VT8 (INSERT_SUBREG QPR:$src1,
+ (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+ (DSubReg_i16_reg imm:$lane))),
+ (COPY_TO_REGCLASS HPR:$src2, GPR),
+ (SubReg_i16_lane imm:$lane))),
+ (DSubReg_i16_reg imm:$lane)))>;
+}
+
let Predicates = [HasNEON] in {
def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
(v16i8 (INSERT_SUBREG QPR:$src1,
@@ -6442,14 +6487,7 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
-def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane),
- (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>;
-def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane),
- (v8f16 (INSERT_SUBREG QPR:$src1,
- (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i16_reg imm:$lane))),
- (VMOVRH $src2), (SubReg_i16_lane imm:$lane))),
- (DSubReg_i16_reg imm:$lane)))>;
+defm : InsertEltF16<f16, v4f16, v8f16>;
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
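A matching sketch for the InsertEltF16 patterns above (editor's annotation, not from the commit); the vsetq_lane_* spellings are ACLE assumptions.

// Inserting a scalar f16/bf16 lane: selected through VSETLNi16 with the
// scalar copied to a GPR first; using VINS for odd lanes is the TODO above.
#include <arm_neon.h>

float16x8_t  set_f16_lane(float16x8_t v, float16_t s)    { return vsetq_lane_f16(s, v, 3); }
bfloat16x8_t set_bf16_lane(bfloat16x8_t v, bfloat16_t s) { return vsetq_lane_bf16(s, v, 3); }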
@@ -6484,6 +6522,9 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
dsub_0)>;
}
+let Predicates = [HasNEON, HasBF16] in
+defm : InsertEltF16<bf16, v4bf16, v8bf16>;
+
// VDUP : Vector Duplicate (from ARM core register to all elements)
class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
@@ -6588,18 +6629,35 @@ def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)),
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def : Pat<(v4f16 (ARMvdup HPR:$src)),
+def : Pat<(v4f16 (ARMvdup (f16 HPR:$src))),
(v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
- HPR:$src, ssub_0), (i32 0)))>;
+ (f16 HPR:$src), ssub_0), (i32 0)))>;
def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))),
(v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))),
(v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v8f16 (ARMvdup HPR:$src)),
+def : Pat<(v8f16 (ARMvdup (f16 HPR:$src))),
(v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
- HPR:$src, ssub_0), (i32 0)))>;
+ (f16 HPR:$src), ssub_0), (i32 0)))>;
+}
+
+let Predicates = [HasNEON, HasBF16] in {
+def : Pat<(v4bf16 (ARMvduplane (v4bf16 DPR:$Vm), imm:$lane)),
+ (VDUPLN16d DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v8bf16 (ARMvduplane (v8bf16 QPR:$src), imm:$lane)),
+ (v8bf16 (VDUPLN16q (v4bf16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4bf16 (ARMvdup (bf16 HPR:$src))),
+ (v4bf16 (VDUPLN16d (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)),
+ (bf16 HPR:$src), ssub_0), (i32 0)))>;
+def : Pat<(v8bf16 (ARMvdup (bf16 HPR:$src))),
+ (v8bf16 (VDUPLN16q (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)),
+ (bf16 HPR:$src), ssub_0), (i32 0)))>;
}
// VMOVN : Vector Narrowing Move
@@ -7330,7 +7388,7 @@ def : Pat<(arm_vmovsr GPR:$a),
Requires<[HasNEON, DontUseVMOVSR]>;
//===----------------------------------------------------------------------===//
-// Non-Instruction Patterns or Endiness - Revert Patterns
+// Non-Instruction Patterns or Endianness - Revert Patterns
//===----------------------------------------------------------------------===//
// bit_convert
@@ -7345,6 +7403,9 @@ def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v4bf16 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 DPR:$src))), (v4bf16 DPR:$src)>;
+
// 128 bit conversions
def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
@@ -7354,6 +7415,9 @@ def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v8bf16 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 QPR:$src))), (v8bf16 QPR:$src)>;
}
let Predicates = [IsLE,HasNEON] in {
@@ -7361,24 +7425,28 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4bf16 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4bf16 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4bf16 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4bf16 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
@@ -7388,6 +7456,12 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>;
def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (f64 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v1i64 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2f32 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2i32 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v8i8 DPR:$src))), (v4bf16 DPR:$src)>;
+
def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
@@ -7399,30 +7473,35 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4bf16 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
// 128 bit conversions
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8bf16 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8bf16 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8bf16 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8bf16 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
@@ -7432,6 +7511,12 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2f64 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2i64 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4f32 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4i32 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v16i8 QPR:$src))), (v8bf16 QPR:$src)>;
+
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
@@ -7443,6 +7528,7 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8bf16 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
}
@@ -7451,24 +7537,28 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4bf16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4bf16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4bf16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4bf16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
@@ -7478,6 +7568,12 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+
def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
@@ -7489,30 +7585,35 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4bf16 DPR:$src))), (VREV16d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
// 128 bit conversions
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8bf16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8bf16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8bf16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8bf16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
@@ -7522,6 +7623,12 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
@@ -7533,9 +7640,26 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8bf16 QPR:$src))), (VREV16q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
}
+let Predicates = [HasNEON] in {
+ // Here we match the specific SDNode type 'ARMVectorRegCastImpl'
+ // rather than the more general 'ARMVectorRegCast' which would also
+ // match some bitconverts. If we use the latter in cases where the
+ // input and output types are the same, the bitconvert gets elided
+ // and we end up generating a nonsense match of nothing.
+
+ foreach VT = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in
+ foreach VT2 = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 QPR:$src))), (VT QPR:$src)>;
+
+ foreach VT = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, f64 ] in
+ foreach VT2 = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, f64 ] in
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
let Predicates = [IsBE,HasNEON] in {
def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
@@ -7863,6 +7987,8 @@ def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
def : Pat<(v8f16 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v8bf16 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
}
//===----------------------------------------------------------------------===//
@@ -8915,3 +9041,115 @@ def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
(VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
(VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+
+// ARMv8.6a BFloat16 instructions.
+let Predicates = [HasBF16, HasNEON] in {
+class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6,
+ dag oops, dag iops, list<dag> pattern>
+ : N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops,
+ N3RegFrm, IIC_VDOTPROD, "", "", pattern>
+{
+ let DecoderNamespace = "VFPV8";
+}
+
+class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy>
+ : BF16VDOT<0b11000, 0b00, Q, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+ [(set (AccumTy RegTy:$dst),
+ (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy RegTy:$Vm)))]> {
+ let Constraints = "$dst = $Vd";
+ let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
+ let DecoderNamespace = "VFPV8";
+}
+
+multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
+ ValueType InputTy, dag RHS> {
+
+ def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn,
+ DPR_VFP2:$Vm, VectorIndex32:$lane), []> {
+ bit lane;
+ let Inst{5} = lane;
+ let Constraints = "$dst = $Vd";
+ let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm$lane");
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
+}
+
+def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
+def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
+
+defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
+defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+
+class BF16MM<bit Q, RegisterClass RegTy,
+ string opc>
+ : N3Vnp<0b11000, 0b00, 0b1100, Q, 0,
+ (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+ N3RegFrm, IIC_VDOTPROD, "", "",
+ [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
+ let Constraints = "$dst = $Vd";
+ let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
+ let DecoderNamespace = "VFPV8";
+}
+
+def VMMLA : BF16MM<1, QPR, "vmmla">;
+
+class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
+ : N3VCP8<0b00, 0b11, T, 1,
+ (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),
+ NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
+ [(set (v4f32 QPR:$dst),
+ (OpNode (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
+ let Constraints = "$dst = $Vd";
+ let DecoderNamespace = "VFPV8";
+}
+
+def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>;
+def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>;
+
+multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
+ def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),
+ (ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
+ IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {
+ bits<2> idx;
+ let Inst{5} = idx{1};
+ let Inst{3} = idx{0};
+ let Constraints = "$dst = $Vd";
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (v4f32 (OpNode (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
+ VectorIndex16:$lane)))))),
+ (!cast<Instruction>(NAME) QPR:$Vd,
+ QPR:$Vn,
+ (EXTRACT_SUBREG QPR:$Vm,
+ (DSubReg_i16_reg VectorIndex16:$lane)),
+ (SubReg_i16_lane VectorIndex16:$lane))>;
+}
+
+defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>;
+defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>;
+
+def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,
+ (outs DPR:$Vd), (ins QPR:$Vm),
+ NoItinerary, "vcvt", "bf16.f32", "$Vd, $Vm", "", []>;
+}
+// End of BFloat16 instructions
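Editor's annotation, not part of the commit: at the source level these definitions are reached through the ACLE BF16 vector intrinsics. The intrinsic spellings and flags below are assumptions from the ACLE BF16 extension; verify against your arm_neon.h.

// Assumes e.g. -march=armv8.6-a+bf16+simd -mfloat-abi=hard.
#include <arm_neon.h>

float32x4_t dot(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfdotq_f32(acc, a, b);    // expected to select VDOT.BF16  (BF16VDOTS_VDOTQ)
}
float32x4_t mmla(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfmmlaq_f32(acc, a, b);   // expected to select VMMLA.BF16 (VMMLA)
}
float32x4_t fma_bottom(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfmlalbq_f32(acc, a, b);  // expected to select VFMAB.BF16 (VBF16MALBQ)
}
float32x4_t fma_top(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfmlaltq_f32(acc, a, b);  // expected to select VFMAT.BF16 (VBF16MALTQ)
}
bfloat16x4_t narrow(float32x4_t v) {
  return vcvt_bf16_f32(v);          // expected to select VCVT.BF16.F32 (BF16_VCVT)
}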
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
index 18bcbda44580..7fae32117243 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -14,6 +14,10 @@
// Thumb specific DAG Nodes.
//
+def ARMtsecall : SDNode<"ARMISD::tSECALL", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
@@ -499,6 +503,10 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
[(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
+ // alternative return for CMSE entry functions
+ def tBXNS_RET : tPseudoInst<(outs), (ins), 2, IIC_Br,
+ [(ARMseretflag)]>, Sched<[WriteBr]>;
+
// Alternative return instruction used by vararg functions.
def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
2, IIC_Br, [],
@@ -560,6 +568,10 @@ let isCall = 1,
let Unpredictable{1-0} = 0b11;
}
+ def tBLXNS_CALL : PseudoInst<(outs), (ins GPRnopc:$func), IIC_Br,
+ [(ARMtsecall GPRnopc:$func)]>,
+ Requires<[IsThumb, Has8MSecExt]>, Sched<[WriteBr]>;
+
// ARMv4T
def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
@@ -1513,7 +1525,7 @@ def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
// tromped upon when we get here from a longjmp(). We force everything out of
// registers except for our own input by listing the relevant registers in
// Defs. By doing so, we also cause the prologue/epilogue code to actively
-// preserve all of the callee-saved resgisters, which is exactly what we want.
+// preserve all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
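Editor's sketch, not from the commit, of the C constructs that exercise the new CMSE pseudos above: a cmse_nonsecure_entry function returns through BXNS (tBXNS_RET) and an indirect cmse_nonsecure_call lowers to BLXNS (tBLXNS_CALL). Function names are invented; the attributes and the -mcmse flag come from the ARMv8-M Security Extensions ACLE.

// Assumes e.g. -target thumbv8m.main-none-eabi -mcmse.
int __attribute__((cmse_nonsecure_entry)) secure_add(int a, int b) {
  return a + b;               // return becomes BXNS LR (tBXNS_RET pseudo)
}

typedef int __attribute__((cmse_nonsecure_call)) ns_call(int);
int call_nonsecure(ns_call *fn) {
  return fn(42);              // indirect call becomes BLXNS (tBLXNS_CALL pseudo)
}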
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
index c5aae235f25d..7137e8ee66b8 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -270,7 +270,8 @@ def t2am_imm8_offset : MemOperand,
// t2addrmode_imm8s4 := reg +/- (imm8 << 2)
def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
-class T2AddrMode_Imm8s4 : MemOperand {
+class T2AddrMode_Imm8s4 : MemOperand,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm8<2>", []> {
let EncoderMethod = "getT2AddrModeImm8s4OpValue";
let DecoderMethod = "DecodeT2AddrModeImm8s4";
let ParserMatchClass = MemImm8s4OffsetAsmOperand;
@@ -1448,7 +1449,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
(ins t2addrmode_imm8s4:$addr),
- IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>,
+ IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "",
+ [(set rGPR:$Rt, rGPR:$Rt2, (ARMldrd t2addrmode_imm8s4:$addr))]>,
Sched<[WriteLd]>;
} // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
@@ -1629,7 +1631,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
- IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>,
+ IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "",
+ [(ARMstrd rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr)]>,
Sched<[WriteST]>;
// Indexed stores
@@ -1745,7 +1748,7 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
// ldrd / strd pre / post variants
-let mayLoad = 1 in
+let mayLoad = 1, hasSideEffects = 0 in
def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
"ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []>,
@@ -1753,13 +1756,13 @@ def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
let DecoderMethod = "DecodeT2LDRDPreInstruction";
}
-let mayLoad = 1 in
+let mayLoad = 1, hasSideEffects = 0 in
def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
"$addr.base = $wb", []>, Sched<[WriteLd]>;
-let mayStore = 1 in
+let mayStore = 1, hasSideEffects = 0 in
def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
@@ -1767,7 +1770,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
let DecoderMethod = "DecodeT2STRDPreInstruction";
}
-let mayStore = 1 in
+let mayStore = 1, hasSideEffects = 0 in
def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
t2am_imm8s4_offset:$imm),
@@ -1871,6 +1874,34 @@ defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>;
defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>;
+// PLD/PLDW/PLI aliases w/ the optional .w suffix
+def : t2InstAlias<"pld${p}.w\t$addr",
+ (t2PLDi12 t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"pld${p}.w\t$addr",
+ (t2PLDi8 t2addrmode_negimm8:$addr, pred:$p)>;
+def : t2InstAlias<"pld${p}.w\t$addr",
+ (t2PLDs t2addrmode_so_reg:$addr, pred:$p)>;
+
+def : InstAlias<"pldw${p}.w\t$addr",
+ (t2PLDWi12 t2addrmode_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7,HasMP]>;
+def : InstAlias<"pldw${p}.w\t$addr",
+ (t2PLDWi8 t2addrmode_negimm8:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7,HasMP]>;
+def : InstAlias<"pldw${p}.w\t$addr",
+ (t2PLDWs t2addrmode_so_reg:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7,HasMP]>;
+
+def : InstAlias<"pli${p}.w\t$addr",
+ (t2PLIi12 t2addrmode_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+def : InstAlias<"pli${p}.w\t$addr",
+ (t2PLIi8 t2addrmode_negimm8:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+def : InstAlias<"pli${p}.w\t$addr",
+ (t2PLIs t2addrmode_so_reg:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+
// pci variant is very similar to i12, but supports negative offsets
// from the PC. Only PLD and PLI have pci variants (not PLDW)
class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
@@ -1893,6 +1924,24 @@ class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
def t2PLDpci : T2Iplpci<0, "pld">, Requires<[IsThumb2]>;
def t2PLIpci : T2Iplpci<1, "pli">, Requires<[IsThumb2,HasV7]>;
+def : t2InstAlias<"pld${p}.w $addr",
+ (t2PLDpci t2ldrlabel:$addr, pred:$p)>;
+def : InstAlias<"pli${p}.w $addr",
+ (t2PLIpci t2ldrlabel:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+
+// PLD/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+ (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+def : t2InstAlias<"pld${p}.w $addr",
+ (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p}.w $addr",
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -2436,7 +2485,7 @@ def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, rGPR:$Rn),
(t2QADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, rGPR:$Rn),
(t2QSUB rGPR:$Rm, rGPR:$Rn)>;
-def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
(t2QDADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
@@ -2445,7 +2494,7 @@ def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn),
(t2QADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn),
(t2QSUB rGPR:$Rm, rGPR:$Rn)>;
-def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+def : Thumb2DSPPat<(saddsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(t2QDADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
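Annotation on the corrected t2QDADD/t2QDSUB patterns above (not part of the commit): QDADD/QDSUB saturate-double the last source register, i.e. Rd = sat(Rn +/- sat(2*Rm)), and the updated patterns now feed the doubled value into that operand. A C reference using the ACLE saturation intrinsics; the __qadd/__qsub/__qdbl names are from arm_acle.h and a DSP-capable core is assumed.

#include <arm_acle.h>
#include <stdint.h>

int32_t qdadd_ref(int32_t n, int32_t m) {
  return __qadd(n, __qdbl(m));   // sat(n + sat(2*m)); expected to select QDADD/t2QDADD
}
int32_t qdsub_ref(int32_t n, int32_t m) {
  return __qsub(n, __qdbl(m));   // sat(n - sat(2*m)); expected to select QDSUB/t2QDSUB
}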
@@ -2716,6 +2765,8 @@ def t2SBFX: T2TwoRegBitFI<
let Inst{25} = 1;
let Inst{24-20} = 0b10100;
let Inst{15} = 0;
+
+ let hasSideEffects = 0;
}
def t2UBFX: T2TwoRegBitFI<
@@ -2725,6 +2776,8 @@ def t2UBFX: T2TwoRegBitFI<
let Inst{25} = 1;
let Inst{24-20} = 0b11100;
let Inst{15} = 0;
+
+ let hasSideEffects = 0;
}
// A8.8.247 UDF - Undefined (Encoding T2)
@@ -3708,7 +3761,7 @@ def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
// when we get here from a longjmp(). We force everything out of registers
// except for our own input by listing the relevant registers in Defs. By
// doing so, we also cause the prologue/epilogue code to actively preserve
-// all of the callee-saved resgisters, which is exactly what we want.
+// all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
@@ -4147,7 +4200,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
imm:$cp))]>,
Requires<[IsThumb2]>;
-// Pseudo isntruction that combines movs + predicated rsbmi
+// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
@@ -4848,9 +4901,15 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
(t2TSTrr rGPR:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
+def : InstAlias<"dmb${p}.w\t$opt", (t2DMB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dmb${p}.w", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}.w\t$opt", (t2DSB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}.w", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}.w\t$opt", (t2ISB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
// Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where
// 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR".
@@ -5184,14 +5243,6 @@ def : t2InstAlias<"ldr${p}.w $Rt, $immediate",
(t2LDRConstPool GPRnopc:$Rt,
const_pool_asm_imm:$immediate, pred:$p)>;
-// PLD/PLDW/PLI with alternate literal form.
-def : t2InstAlias<"pld${p} $addr",
- (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
-def : InstAlias<"pli${p} $addr",
- (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
- Requires<[IsThumb2,HasV7]>;
-
-
//===----------------------------------------------------------------------===//
// ARMv8.1m instructions
//
@@ -5204,7 +5255,7 @@ class V8_1MI<dag oops, dag iops, AddrMode am, InstrItinClass itin, string asm,
def t2CLRM : V8_1MI<(outs),
(ins pred:$p, reglist_with_apsr:$regs, variable_ops),
- AddrModeNone, NoItinerary, "clrm", "${p}\t$regs", "", []> {
+ AddrModeNone, NoItinerary, "clrm${p}", "$regs", "", []> {
bits<16> regs;
let Inst{31-16} = 0b1110100010011111;
@@ -5357,6 +5408,7 @@ def t2DoLoopStart :
t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
[(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+let hasSideEffects = 0 in
def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
index f1d1d8a89164..8a652c1d90f6 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -158,11 +158,24 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let isUnpredicable = 1 in
def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
- [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
+ [(set HPR:$Sd, (f16 (alignedload16 addrmode5fp16:$addr)))]>,
Requires<[HasFPRegs16]>;
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
+def : Pat<(bf16 (alignedload16 addrmode5fp16:$addr)),
+ (VLDRH addrmode5fp16:$addr)> {
+ let Predicates = [HasFPRegs16];
+}
+def : Pat<(bf16 (alignedload16 addrmode3:$addr)),
+ (COPY_TO_REGCLASS (LDRH addrmode3:$addr), HPR)> {
+ let Predicates = [HasNoFPRegs16, IsARM];
+}
+def : Pat<(bf16 (alignedload16 t2addrmode_imm12:$addr)),
+ (COPY_TO_REGCLASS (t2LDRHi12 t2addrmode_imm12:$addr), HPR)> {
+ let Predicates = [HasNoFPRegs16, IsThumb];
+}
+
def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
IIC_fpStore64, "vstr", "\t$Dd, $addr",
[(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>,
@@ -180,9 +193,22 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let isUnpredicable = 1 in
def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
- [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
+ [(alignedstore16 (f16 HPR:$Sd), addrmode5fp16:$addr)]>,
Requires<[HasFPRegs16]>;
+def : Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode5fp16:$addr),
+ (VSTRH (bf16 HPR:$Sd), addrmode5fp16:$addr)> {
+ let Predicates = [HasFPRegs16];
+}
+def : Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode3:$addr),
+ (STRH (COPY_TO_REGCLASS $Sd, GPR), addrmode3:$addr)> {
+ let Predicates = [HasNoFPRegs16, IsARM];
+}
+def : Pat<(alignedstore16 (bf16 HPR:$Sd), t2addrmode_imm12:$addr),
+ (t2STRHi12 (COPY_TO_REGCLASS $Sd, GPR), t2addrmode_imm12:$addr)> {
+ let Predicates = [HasNoFPRegs16, IsThumb];
+}
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -277,7 +303,6 @@ def : MnemonicAlias<"vstm", "vstmia">;
//===----------------------------------------------------------------------===//
// Lazy load / store multiple Instructions
//
-let mayLoad = 1 in
def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
NoItinerary, "vlldm${p}\t$Rn", "", []>,
Requires<[HasV8MMainline, Has8MSecExt]> {
@@ -288,9 +313,9 @@ def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
let Inst{15-12} = 0;
let Inst{7-0} = 0;
let mayLoad = 1;
+ let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV];
}
-let mayStore = 1 in
def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
NoItinerary, "vlstm${p}\t$Rn", "", []>,
Requires<[HasV8MMainline, Has8MSecExt]> {
@@ -387,7 +412,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -412,7 +437,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -433,7 +458,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -458,7 +483,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@@ -480,7 +505,7 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>,
+ [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
@@ -489,7 +514,7 @@ multiclass vsel_inst<string op, bits<2> opc, int CC> {
def H : AHbInp<0b11100, opc, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
- [(set HPR:$Sd, (ARMcmov HPR:$Sm, HPR:$Sn, CC))]>,
+ [(set (f16 HPR:$Sd), (ARMcmov (f16 HPR:$Sm), (f16 HPR:$Sn), CC))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11100, opc, 0,
@@ -518,7 +543,7 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
def H : AHbInp<0b11101, 0b00, opc,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
- [(set HPR:$Sd, (SD HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (SD (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11101, 0b00, opc,
@@ -564,7 +589,7 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
- [(arm_cmpfpe HPR:$Sd, HPR:$Sm)]>;
+ [(arm_cmpfpe (f16 HPR:$Sd), (f16 HPR:$Sm))]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -583,7 +608,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
- [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>;
+ [(arm_cmpfp (f16 HPR:$Sd), (f16 HPR:$Sm))]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -607,7 +632,7 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
- [(set HPR:$Sd, (fabs (f16 HPR:$Sm)))]>;
+ [(set (f16 HPR:$Sd), (fabs (f16 HPR:$Sm)))]>;
let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
@@ -633,7 +658,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
- [(arm_cmpfpe0 HPR:$Sd)]> {
+ [(arm_cmpfpe0 (f16 HPR:$Sd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -661,7 +686,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
- [(arm_cmpfp0 HPR:$Sd)]> {
+ [(arm_cmpfp0 (f16 HPR:$Sd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -683,6 +708,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
let Inst{22} = Dd{4};
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
// Special case encoding: bits 11-8 is 0b1011.
@@ -707,20 +733,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
let Inst{4} = 0;
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
// Between half, single and double-precision.
+let hasSideEffects = 0 in
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
[/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FP16Pat<(f32 (fpextend HPR:$Sm)),
- (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))),
+ (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>;
def : FP16Pat<(f16_to_fp GPR:$a),
(VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+let hasSideEffects = 0 in
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* Intentionally left blank, see patterns below */]>,
@@ -731,19 +760,41 @@ def : FP16Pat<(f16 (fpround SPR:$Sm)),
(COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
def : FP16Pat<(fp_to_f16 SPR:$a),
(i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+ (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+ (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+let hasSideEffects = 0 in
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
+ (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>;
+def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
+ (VCVTTHS (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane)))>;
+
+let hasSideEffects = 0 in
def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+ (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+ (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
@@ -756,10 +807,12 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
// Encode instruction operands.
let Inst{3-0} = Sm{4-1};
let Inst{5} = Sm{0};
+
+ let hasSideEffects = 0;
}
-def : FullFP16Pat<(f64 (fpextend HPR:$Sm)),
- (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>,
+def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))),
+ (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>,
Requires<[HasFPARMv8, HasDPVFP]>;
def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
(VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>,
@@ -779,6 +832,8 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
let Inst{5} = Dm{4};
let Inst{15-12} = Sd{4-1};
let Inst{22} = Sd{0};
+
+ let hasSideEffects = 0;
}
def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
@@ -798,6 +853,8 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
// Encode instruction operands.
let Inst{3-0} = Sm{4-1};
let Inst{5} = Sm{0};
+
+ let hasSideEffects = 0;
}
def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
@@ -813,11 +870,13 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
let Inst{22} = Sd{0};
let Inst{3-0} = Dm{3-0};
let Inst{5} = Dm{4};
+
+ let hasSideEffects = 0;
}
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
- let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0 in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
@@ -883,14 +942,14 @@ multiclass vcvt_inst<string opc, bits<2> rm,
let Predicates = [HasFPARMv8] in {
let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (fp_to_sint (node HPR:$a))),
+ def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))),
(COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"SH") HPR:$a),
+ (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)),
GPR)>;
- def : Pat<(i32 (fp_to_uint (node HPR:$a))),
+ def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))),
(COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"UH") HPR:$a),
+ (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)),
GPR)>;
}
def : Pat<(i32 (fp_to_sint (node SPR:$a))),
@@ -936,7 +995,7 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
- [(set HPR:$Sd, (fneg HPR:$Sm))]>;
+ [(set (f16 HPR:$Sd), (fneg (f16 HPR:$Sm)))]>;
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
@@ -1035,7 +1094,7 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
- [(set HPR:$Sd, (fsqrt (f16 HPR:$Sm)))]>;
+ [(set (f16 HPR:$Sd), (fsqrt (f16 HPR:$Sm)))]>;
let hasSideEffects = 0 in {
let isMoveReg = 1 in {
@@ -1250,7 +1309,7 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
def VMOVRH : AVConv2I<0b11100001, 0b1001,
(outs rGPR:$Rt), (ins HPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
- [(set rGPR:$Rt, (arm_vmovrh HPR:$Sn))]>,
+ []>,
Requires<[HasFPRegs16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1272,7 +1331,7 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
def VMOVHR : AVConv4I<0b11100000, 0b1001,
(outs HPR:$Sn), (ins rGPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
- [(set HPR:$Sn, (arm_vmovhr rGPR:$Rt))]>,
+ []>,
Requires<[HasFPRegs16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1290,6 +1349,11 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001,
let isUnpredicable = 1;
}
+def : FPRegs16Pat<(arm_vmovrh (f16 HPR:$Sn)), (VMOVRH (f16 HPR:$Sn))>;
+def : FPRegs16Pat<(arm_vmovrh (bf16 HPR:$Sn)), (VMOVRH (bf16 HPR:$Sn))>;
+def : FPRegs16Pat<(f16 (arm_vmovhr rGPR:$Rt)), (VMOVHR rGPR:$Rt)>;
+def : FPRegs16Pat<(bf16 (arm_vmovhr rGPR:$Rt)), (VMOVHR rGPR:$Rt)>;
+
// FMRDH: SPR -> GPR
// FMRDL: SPR -> GPR
// FMRRS: SPR -> GPR
@@ -1317,6 +1381,7 @@ class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Dd{4};
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1333,6 +1398,8 @@ class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{5} = Sm{0};
let Inst{15-12} = Sd{4-1};
let Inst{22} = Sd{0};
+
+ let hasSideEffects = 0;
}
class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1352,6 +1419,7 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
let Predicates = [HasFullFP16];
+ let hasSideEffects = 0;
}
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
@@ -1465,6 +1533,7 @@ class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1501,6 +1570,7 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
let Predicates = [HasFullFP16];
+ let hasSideEffects = 0;
}
// Always set Z bit in the instruction, i.e. "round towards zero" variants.
@@ -1548,8 +1618,8 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
- (COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>;
+def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))),
+ (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
@@ -1595,8 +1665,8 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
- (COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>;
+def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))),
+ (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
@@ -1680,6 +1750,8 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{0};
let Inst{15-12} = dst{4-1};
+
+ let hasSideEffects = 0;
}
// Double Precision register
@@ -1692,6 +1764,7 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
let Inst{22} = dst{4};
let Inst{15-12} = dst{3-0};
+ let hasSideEffects = 0;
let Predicates = [HasVFP2, HasDPVFP];
}
@@ -1867,6 +1940,37 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
} // End of 'let Constraints = "$a = $dst" in'
+// BFloat16 - Single precision, unary, predicated
+class BF16_VCVT<string opc, bits<2> op7_6>
+ : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm),
+ VFPUnaryFrm, NoItinerary,
+ opc, ".bf16.f32\t$Sd, $Sm", []>,
+ RegConstraint<"$dst = $Sd">,
+ Requires<[HasBF16]>,
+ Sched<[]> {
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = 0b11101; // opcode1
+ let Inst{21-20} = 0b11; // opcode2
+ let Inst{19-16} = 0b0011; // opcode3
+ let Inst{11-8} = 0b1001;
+ let Inst{7-6} = op7_6;
+ let Inst{4} = 0;
+
+ let DecoderNamespace = "VFPV8";
+ let hasSideEffects = 0;
+}
+
+def BF16_VCVTB : BF16_VCVT<"vcvtb", 0b01>;
+def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>;
+
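Editor's note, not in the commit: the scalar vcvtb/vcvtt bf16 converts above back the ACLE scalar conversion; the intrinsic name is an assumption from arm_bf16.h (pulled in via arm_neon.h when +bf16 is enabled).

#include <arm_neon.h>

bfloat16_t to_bf16(float x) {
  return vcvth_bf16_f32(x);      // expected to select VCVTB.BF16.F32
}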
//===----------------------------------------------------------------------===//
// FP Multiply-Accumulate Operations.
//
@@ -1896,8 +2000,8 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
def VMLAH : AHbI<0b11100, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -1907,8 +2011,8 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
-def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
@@ -1937,8 +2041,8 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
def VMLSH : AHbI<0b11100, 0b00, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -1948,8 +2052,8 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
@@ -1977,8 +2081,8 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -1989,8 +2093,8 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
- (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx (fneg (fmul_su (f16 HPR:$a), HPR:$b)), HPR:$dstin),
+ (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
// (-dst - (a * b)) -> -(dst + (a * b))
@@ -2000,8 +2104,8 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
- (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
@@ -2028,7 +2132,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -2038,8 +2142,8 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
- (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx (fmul_su (f16 HPR:$a), HPR:$b), HPR:$dstin),
+ (VNMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
//===----------------------------------------------------------------------===//
@@ -2069,8 +2173,8 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
def VFMAH : AHbI<0b11101, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2081,8 +2185,8 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
-def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VFMAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VFMAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
@@ -2093,8 +2197,8 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
(VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, HPR:$Sdin)),
- (VFMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
+ (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
def VFMSD : ADbI<0b11101, 0b10, 1, 0,
@@ -2121,8 +2225,8 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
def VFMSH : AHbI<0b11101, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2133,8 +2237,8 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
-def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VFMSH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VFMSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
@@ -2145,8 +2249,8 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin)),
- (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
+ (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fma x, (fneg y), z) -> (vfms z, x, y)
def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
@@ -2155,8 +2259,8 @@ def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin)),
- (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin))),
+ (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
@@ -2183,8 +2287,8 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2204,8 +2308,8 @@ def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
- (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))),
+ (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
@@ -2214,8 +2318,8 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, (fneg HPR:$Sdin))),
- (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+ (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
@@ -2241,7 +2345,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2262,8 +2366,8 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (fneg HPR:$Sdin))),
- (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+ (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
@@ -2272,8 +2376,8 @@ def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin))),
- (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
+ (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
@@ -2282,8 +2386,8 @@ def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin))),
- (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(fneg (f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin)))),
+ (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
@@ -2306,7 +2410,7 @@ def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p),
IIC_fpUNA16,
[(set (f16 HPR:$Sd),
- (ARMcmov HPR:$Sn, HPR:$Sm, cmovpred:$p))]>,
+ (ARMcmov (f16 HPR:$Sn), (f16 HPR:$Sm), cmovpred:$p))]>,
RegConstraint<"$Sd = $Sn">, Requires<[HasFPRegs]>;
} // hasSideEffects
@@ -2512,7 +2616,7 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm),
VFPMiscFrm, IIC_fpUNA16,
"vmov", ".f16\t$Sd, $imm",
- [(set HPR:$Sd, vfp_f16imm:$imm)]>,
+ [(set (f16 HPR:$Sd), vfp_f16imm:$imm)]>,
Requires<[HasFullFP16]> {
bits<5> Sd;
bits<8> imm;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 67816bc2103f..c8a894fb11a8 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -239,17 +239,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB,
// We only support G_MERGE_VALUES as a way to stick together two scalar GPRs
// into one DPR.
- Register VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB.getReg(0);
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- Register VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB.getReg(1);
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- Register VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB.getReg(2);
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
@@ -271,17 +271,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
// We only support G_UNMERGE_VALUES as a way to break up one DPR into two
// GPRs.
- Register VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB.getReg(0);
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- Register VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB.getReg(1);
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- Register VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB.getReg(2);
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 64 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID &&
@@ -530,7 +530,7 @@ bool ARMInstructionSelector::selectCmp(CmpConstants Helper,
MachineRegisterInfo &MRI) const {
const InsertInfo I(MIB);
- auto ResReg = MIB->getOperand(0).getReg();
+ auto ResReg = MIB.getReg(0);
if (!validReg(MRI, ResReg, 1, ARM::GPRRegBankID))
return false;
@@ -542,8 +542,8 @@ bool ARMInstructionSelector::selectCmp(CmpConstants Helper,
return true;
}
- auto LHSReg = MIB->getOperand(2).getReg();
- auto RHSReg = MIB->getOperand(3).getReg();
+ auto LHSReg = MIB.getReg(2);
+ auto RHSReg = MIB.getReg(3);
if (!validOpRegPair(MRI, LHSReg, RHSReg, Helper.OperandSize,
Helper.OperandRegBankID))
return false;
@@ -627,7 +627,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
bool UseMovt = STI.useMovt();
unsigned Size = TM.getPointerSize(0);
- unsigned Alignment = 4;
+ const Align Alignment(4);
auto addOpsForConstantPoolLoad = [&MF, Alignment,
Size](MachineInstrBuilder &MIB,
@@ -687,7 +687,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
if (Indirect) {
if (!UseOpcodeThatLoads) {
- auto ResultReg = MIB->getOperand(0).getReg();
+ auto ResultReg = MIB.getReg(0);
auto AddressReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
MIB->getOperand(0).setReg(AddressReg);
@@ -773,7 +773,7 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
auto &DbgLoc = MIB->getDebugLoc();
// Compare the condition to 1.
- auto CondReg = MIB->getOperand(1).getReg();
+ auto CondReg = MIB.getReg(1);
assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) &&
"Unsupported types for select operation");
auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.TSTri))
@@ -785,9 +785,9 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
// Move a value into the result register based on the result of the
// comparison.
- auto ResReg = MIB->getOperand(0).getReg();
- auto TrueReg = MIB->getOperand(2).getReg();
- auto FalseReg = MIB->getOperand(3).getReg();
+ auto ResReg = MIB.getReg(0);
+ auto TrueReg = MIB.getReg(2);
+ auto FalseReg = MIB.getReg(3);
assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) &&
validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) &&
"Unsupported types for select operation");
@@ -990,7 +990,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) {
case G_FCONSTANT: {
// Load from constant pool
unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits() / 8;
- unsigned Alignment = Size;
+ Align Alignment(Size);
assert((Size == 4 || Size == 8) && "Unsupported FP constant type");
auto LoadOpcode = Size == 4 ? ARM::VLDRS : ARM::VLDRD;
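As background for the operand accesses touched in this file: MachineInstrBuilder provides a getReg(Idx) convenience accessor, so the long and short spellings below read the same operand register. This is an illustrative helper only, assuming an in-tree LLVM build; it is not part of the commit.

#include "llvm/CodeGen/MachineInstrBuilder.h"
using llvm::MachineInstrBuilder;
using llvm::Register;

// Both helpers return the register of operand Idx of the instruction being built.
static Register operandRegLong(const MachineInstrBuilder &MIB, unsigned Idx) {
  return MIB->getOperand(Idx).getReg(); // goes through the underlying MachineInstr
}
static Register operandRegShort(const MachineInstrBuilder &MIB, unsigned Idx) {
  return MIB.getReg(Idx);               // the shorthand this patch switches to
}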
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index e2dff51ea61c..f3657155f47e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -357,13 +357,12 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,
llvm_unreachable("Unsupported size for FCmp predicate");
}
-bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
+bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
using namespace TargetOpcode;
- MIRBuilder.setInstr(MI);
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
switch (MI.getOpcode()) {
@@ -445,8 +444,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
} else {
// We need to compare against 0.
assert(CmpInst::isIntPredicate(ResultPred) && "Unsupported predicate");
- auto Zero = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MIRBuilder.buildConstant(Zero, 0);
+ auto Zero = MIRBuilder.buildConstant(LLT::scalar(32), 0);
MIRBuilder.buildICmp(ResultPred, ProcessedResult, LibcallResult, Zero);
}
Results.push_back(ProcessedResult);
@@ -462,7 +460,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
// Convert to integer constants, while preserving the binary representation.
auto AsInteger =
MI.getOperand(1).getFPImm()->getValueAPF().bitcastToAPInt();
- MIRBuilder.buildConstant(MI.getOperand(0).getReg(),
+ MIRBuilder.buildConstant(MI.getOperand(0),
*ConstantInt::get(Ctx, AsInteger));
break;
}
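For orientation on the legalizeCustom() signature change above: the builder and the MachineRegisterInfo are now obtained from the LegalizerHelper rather than being passed as separate arguments. The skeleton below is a sketch only, with a made-up target class name, and just shows where the pieces now come from; it is not the actual ARM implementation.

#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
using namespace llvm;

class MyTargetLegalizerInfo : public LegalizerInfo { // hypothetical target class
public:
  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override {
    MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; // builder owned by the helper
    MachineRegisterInfo &MRI = *MIRBuilder.getMRI();  // register info via the builder
    (void)MRI;
    (void)MI;
    // Constants can be built and fed straight into later build calls, e.g.:
    //   auto Zero = MIRBuilder.buildConstant(LLT::scalar(32), 0);
    //   MIRBuilder.buildICmp(CmpInst::ICMP_NE, DstReg, SrcReg, Zero);
    return true;
  }
};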
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h
index e95f8cf76103..f1c2e9c94336 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -28,9 +28,7 @@ class ARMLegalizerInfo : public LegalizerInfo {
public:
ARMLegalizerInfo(const ARMSubtarget &ST);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
private:
void setFCmpLibcallsGNU();
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 12dddd29ca84..a84d23d3bb96 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
@@ -900,7 +902,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
unsigned Offset = getMemoryOpOffset(*First);
Register Base = getLoadStoreBaseOp(*First).getReg();
bool BaseKill = LatestMI->killsRegister(Base);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
DebugLoc DL = First->getDebugLoc();
MachineInstr *Merged = nullptr;
@@ -991,7 +993,7 @@ static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
// Stack pointer alignment is out of the programmers control so we can trust
// SP-relative loads/stores.
if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
- STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+ STI.getFrameLowering()->getTransientStackAlign() >= Align(4))
return true;
return false;
}
@@ -1183,8 +1185,8 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
/// Check if the given instruction increments or decrements a register and
/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
/// generated by the instruction are possibly read as well.
-static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg) {
+static int isIncrementOrDecrement(const MachineInstr &MI, Register Reg,
+ ARMCC::CondCodes Pred, Register PredReg) {
bool CheckCPSRDef;
int Scale;
switch (MI.getOpcode()) {
@@ -1201,7 +1203,7 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
default: return 0;
}
- unsigned MIPredReg;
+ Register MIPredReg;
if (MI.getOperand(0).getReg() != Reg ||
MI.getOperand(1).getReg() != Reg ||
getInstrPredicate(MI, MIPredReg) != Pred ||
@@ -1215,8 +1217,8 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
/// Searches for an increment or decrement of \p Reg before \p MBBI.
static MachineBasicBlock::iterator
-findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+findIncDecBefore(MachineBasicBlock::iterator MBBI, Register Reg,
+ ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
@@ -1235,8 +1237,8 @@ findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
/// Searches for a increment or decrement of \p Reg after \p MBBI.
static MachineBasicBlock::iterator
-findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
+ ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator EndMBBI = MBB.end();
@@ -1270,7 +1272,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
const MachineOperand &BaseOP = MI->getOperand(0);
Register Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
case ARM::t2STRi8:
case ARM::t2STRi12:
return ARM::t2STR_POST;
+
+ case ARM::MVE_VLDRBS16:
+ return ARM::MVE_VLDRBS16_post;
+ case ARM::MVE_VLDRBS32:
+ return ARM::MVE_VLDRBS32_post;
+ case ARM::MVE_VLDRBU16:
+ return ARM::MVE_VLDRBU16_post;
+ case ARM::MVE_VLDRBU32:
+ return ARM::MVE_VLDRBU32_post;
+ case ARM::MVE_VLDRHS32:
+ return ARM::MVE_VLDRHS32_post;
+ case ARM::MVE_VLDRHU32:
+ return ARM::MVE_VLDRHU32_post;
+ case ARM::MVE_VLDRBU8:
+ return ARM::MVE_VLDRBU8_post;
+ case ARM::MVE_VLDRHU16:
+ return ARM::MVE_VLDRHU16_post;
+ case ARM::MVE_VLDRWU32:
+ return ARM::MVE_VLDRWU32_post;
+ case ARM::MVE_VSTRB16:
+ return ARM::MVE_VSTRB16_post;
+ case ARM::MVE_VSTRB32:
+ return ARM::MVE_VSTRB32_post;
+ case ARM::MVE_VSTRH32:
+ return ARM::MVE_VSTRH32_post;
+ case ARM::MVE_VSTRBU8:
+ return ARM::MVE_VSTRBU8_post;
+ case ARM::MVE_VSTRHU16:
+ return ARM::MVE_VSTRHU16_post;
+ case ARM::MVE_VSTRWU32:
+ return ARM::MVE_VSTRWU32_post;
+
default: llvm_unreachable("Unhandled opcode!");
}
}
@@ -1412,7 +1446,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (MI->getOperand(0).getReg() == Base)
return false;
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
@@ -1525,7 +1559,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
return false;
- unsigned PredReg;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
MachineBasicBlock::iterator MBBI(MI);
MachineBasicBlock &MBB = *MI.getParent();
@@ -1602,13 +1636,13 @@ static bool isMemoryOp(const MachineInstr &MI) {
// Don't touch volatile memory accesses - we may be changing their order.
// TODO: We could allow unordered and monotonic atomics here, but we need to
- // make sure the resulting ldm/stm is correctly marked as atomic.
+ // make sure the resulting ldm/stm is correctly marked as atomic.
if (MMO.isVolatile() || MMO.isAtomic())
return false;
// Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
// not.
- if (MMO.getAlignment() < 4)
+ if (MMO.getAlign() < Align(4))
return false;
// str <undef> could probably be eliminated entirely, but for now we just want
@@ -1692,7 +1726,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
assert((isT2 || MI->getOperand(3).getReg() == ARM::NoRegister) &&
"register offset not handled below");
int OffImm = getMemoryOpOffset(*MI);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
if (OddRegNum > EvenRegNum && OffImm == 0) {
@@ -1792,7 +1826,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
const MachineOperand &MO = MBBI->getOperand(0);
Register Reg = MO.getReg();
Register Base = getLoadStoreBaseOp(*MBBI).getReg();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
int Offset = getMemoryOpOffset(*MBBI);
if (CurrBase == 0) {
@@ -2046,6 +2080,7 @@ namespace {
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
MachineRegisterInfo *MRI;
+ MachineDominatorTree *DT;
MachineFunction *MF;
ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,29 +2093,34 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
private:
bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
- unsigned &NewOpc, unsigned &EvenReg,
- unsigned &OddReg, unsigned &BaseReg,
- int &Offset,
- unsigned &PredReg, ARMCC::CondCodes &Pred,
- bool &isT2);
+ unsigned &NewOpc, Register &EvenReg, Register &OddReg,
+ Register &BaseReg, int &Offset, Register &PredReg,
+ ARMCC::CondCodes &Pred, bool &isT2);
bool RescheduleOps(MachineBasicBlock *MBB,
SmallVectorImpl<MachineInstr *> &Ops,
unsigned Base, bool isLd,
DenseMap<MachineInstr*, unsigned> &MI2LocMap);
bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+ bool DistributeIncrements();
+ bool DistributeIncrements(Register Base);
};
} // end anonymous namespace
char ARMPreAllocLoadStoreOpt::ID = 0;
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
- ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+ ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+ ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
// Limit the number of instructions to be rescheduled.
// FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2096,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
MF = &Fn;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- bool Modified = false;
+ bool Modified = DistributeIncrements();
for (MachineBasicBlock &MFI : Fn)
Modified |= RescheduleLoadStoreInstrs(&MFI);
@@ -2143,15 +2184,10 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
return AddedRegPressure.size() <= MemRegs.size() * 2;
}
-bool
-ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
- DebugLoc &dl, unsigned &NewOpc,
- unsigned &FirstReg,
- unsigned &SecondReg,
- unsigned &BaseReg, int &Offset,
- unsigned &PredReg,
- ARMCC::CondCodes &Pred,
- bool &isT2) {
+bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(
+ MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc,
+ Register &FirstReg, Register &SecondReg, Register &BaseReg, int &Offset,
+ Register &PredReg, ARMCC::CondCodes &Pred, bool &isT2) {
// Make sure we're allowed to generate LDRD/STRD.
if (!STI->hasV5TEOps())
return false;
@@ -2183,12 +2219,12 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
(*Op0->memoperands_begin())->isAtomic())
return false;
- unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+ Align Alignment = (*Op0->memoperands_begin())->getAlign();
const Function &Func = MF->getFunction();
- unsigned ReqAlign = STI->hasV6Ops()
- ? TD->getABITypeAlignment(Type::getInt64Ty(Func.getContext()))
- : 8; // Pre-v6 need 8-byte align
- if (Align < ReqAlign)
+ Align ReqAlign =
+ STI->hasV6Ops() ? TD->getABITypeAlign(Type::getInt64Ty(Func.getContext()))
+ : Align(8); // Pre-v6 need 8-byte align
+ if (Alignment < ReqAlign)
return false;
// Then make sure the immediate offset fits.
@@ -2313,8 +2349,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
// to try to allocate a pair of registers that can form register pairs.
MachineInstr *Op0 = Ops.back();
MachineInstr *Op1 = Ops[Ops.size()-2];
- unsigned FirstReg = 0, SecondReg = 0;
- unsigned BaseReg = 0, PredReg = 0;
+ Register FirstReg, SecondReg;
+ Register BaseReg, PredReg;
ARMCC::CondCodes Pred = ARMCC::AL;
bool isT2 = false;
unsigned NewOpc = 0;
@@ -2416,7 +2452,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
if (!isMemoryOp(MI))
continue;
- unsigned PredReg = 0;
+ Register PredReg;
if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
@@ -2482,6 +2518,199 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
return RetVal;
}
+// Get the base register operand index from the memory access MachineInstr if
+// we should attempt to distribute a postinc on it. Return -1 if the instruction
+// is not of a valid type. If an index is returned, the instruction is assumed
+// to use r+i addressing, and getBaseOperandIndex() + 1 is the offset index.
+static int getBaseOperandIndex(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case ARM::MVE_VLDRBS16:
+ case ARM::MVE_VLDRBS32:
+ case ARM::MVE_VLDRBU16:
+ case ARM::MVE_VLDRBU32:
+ case ARM::MVE_VLDRHS32:
+ case ARM::MVE_VLDRHU32:
+ case ARM::MVE_VLDRBU8:
+ case ARM::MVE_VLDRHU16:
+ case ARM::MVE_VLDRWU32:
+ case ARM::MVE_VSTRB16:
+ case ARM::MVE_VSTRB32:
+ case ARM::MVE_VSTRH32:
+ case ARM::MVE_VSTRBU8:
+ case ARM::MVE_VSTRHU16:
+ case ARM::MVE_VSTRWU32:
+ return 1;
+ }
+ return -1;
+}
+
+static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
+ Register NewReg,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ MachineFunction *MF = MI->getMF();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
+ MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);
+
+ const MCInstrDesc &MCID = TII->get(NewOpcode);
+ // Constrain the def register class
+ const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+ MRI.constrainRegClass(NewReg, TRC);
+ // And do the same for the base operand
+ TRC = TII->getRegClass(MCID, 2, TRI, *MF);
+ MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
+
+ return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+ .addReg(NewReg, RegState::Define)
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(Offset)
+ .add(MI->getOperand(3))
+ .add(MI->getOperand(4))
+ .cloneMemRefs(*MI);
+}
+
+// Given a Base Register, optimise the load/store uses to attempt to create more
+// post-inc accesses. We do this by taking a zero-offset load/store and an add,
+// and converting them to a postinc load/store of the same type. Any subsequent
+// accesses will be adjusted to use and account for the post-inc value.
+// For example:
+// LDR #0 LDR_POSTINC #16
+// LDR #4 LDR #-12
+// LDR #8 LDR #-8
+// LDR #12 LDR #-4
+// ADD #16
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
+ // We are looking for:
+ // One zero offset load/store that can become postinc
+ MachineInstr *BaseAccess = nullptr;
+ // An increment that can be folded in
+ MachineInstr *Increment = nullptr;
+ // Other accesses after BaseAccess that will need to be updated to use the
+ // postinc value
+ SmallPtrSet<MachineInstr *, 8> OtherAccesses;
+ for (auto &Use : MRI->use_nodbg_instructions(Base)) {
+ if (!Increment && getAddSubImmediate(Use) != 0) {
+ Increment = &Use;
+ continue;
+ }
+
+ int BaseOp = getBaseOperandIndex(Use);
+ if (BaseOp == -1)
+ return false;
+
+ if (!Use.getOperand(BaseOp).isReg() ||
+ Use.getOperand(BaseOp).getReg() != Base)
+ return false;
+ if (Use.getOperand(BaseOp + 1).getImm() == 0)
+ BaseAccess = &Use;
+ else
+ OtherAccesses.insert(&Use);
+ }
+
+ if (!BaseAccess || !Increment ||
+ BaseAccess->getParent() != Increment->getParent())
+ return false;
+ Register PredReg;
+ if (Increment->definesRegister(ARM::CPSR) ||
+ getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+ << Base.virtRegIndex() << "\n");
+
+ // Make sure that Increment has no uses before BaseAccess.
+ for (MachineInstr &Use :
+ MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+ if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+ LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n");
+ return false;
+ }
+ }
+
+ // Make sure that Increment can be folded into Base
+ int IncrementOffset = getAddSubImmediate(*Increment);
+ unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+ BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+ if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+ LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n");
+ return false;
+ }
+
+  // And make sure that the negative of the increment can be added to all
+ // other offsets after the BaseAccess. We rely on either
+ // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
+ // to keep things simple.
+ SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
+ for (auto *Use : OtherAccesses) {
+ if (DT->dominates(BaseAccess, Use)) {
+ SuccessorAccesses.insert(Use);
+ unsigned BaseOp = getBaseOperandIndex(*Use);
+ if (!isLegalAddressImm(
+ Use->getOpcode(),
+ Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
+ LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n");
+ return false;
+ }
+ } else if (!DT->dominates(Use, BaseAccess)) {
+ LLVM_DEBUG(
+ dbgs() << " Unknown dominance relation between Base and Use\n");
+ return false;
+ }
+ }
+
+ // Replace BaseAccess with a post inc
+ LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+ LLVM_DEBUG(dbgs() << " And : "; Increment->dump());
+ Register NewBaseReg = Increment->getOperand(0).getReg();
+ MachineInstr *BaseAccessPost =
+ createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+ BaseAccess->eraseFromParent();
+ Increment->eraseFromParent();
+ (void)BaseAccessPost;
+ LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump());
+
+ for (auto *Use : SuccessorAccesses) {
+ LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
+ unsigned BaseOp = getBaseOperandIndex(*Use);
+ Use->getOperand(BaseOp).setReg(NewBaseReg);
+ int OldOffset = Use->getOperand(BaseOp + 1).getImm();
+ Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
+ LLVM_DEBUG(dbgs() << " To : "; Use->dump());
+ }
+
+ // Remove the kill flag from all uses of NewBaseReg, in case any old uses
+ // remain.
+ for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
+ Op.setIsKill(false);
+ return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
+ bool Changed = false;
+ SmallSetVector<Register, 4> Visited;
+ for (auto &MBB : *MF) {
+ for (auto &MI : MBB) {
+ int BaseOp = getBaseOperandIndex(MI);
+ if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
+ continue;
+
+ Register Base = MI.getOperand(BaseOp).getReg();
+ if (!Base.isVirtual() || Visited.count(Base))
+ continue;
+
+ Visited.insert(Base);
+ }
+ }
+
+ for (auto Base : Visited)
+ Changed |= DistributeIncrements(Base);
+
+ return Changed;
+}
+
/// Returns an instance of the load / store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
if (PreAlloc)
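To make the increment-distribution rewrite described above concrete: the zero-offset access absorbs the add as a post-increment, and every later access gets the increment subtracted from its immediate. This standalone sketch is not LLVM code; it only reproduces the offset arithmetic from the LDR example in the DistributeIncrements comment.

#include <cstdio>
#include <vector>

int main() {
  const int Increment = 16;                        // the ADD #16 being folded away
  const std::vector<int> Offsets = {0, 4, 8, 12};  // original r+i accesses

  for (int Off : Offsets) {
    if (Off == 0)
      std::printf("LDR_POSTINC #%d\n", Increment); // base access folds the add
    else
      std::printf("LDR #%d\n", Off - Increment);   // later accesses are rebased
  }
  return 0;
}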
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 6717d4706aef..be75d6bef08c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -35,6 +35,20 @@
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
///
+/// A note on VPR.P0 (the lane mask):
+/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
+/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
+/// They will simply "and" the result of their calculation with the current
+/// value of VPR.P0. You can think of it like this:
+/// \verbatim
+/// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
+/// VPR.P0 &= Value
+/// else
+/// VPR.P0 = Value
+/// \endverbatim
+/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
+/// fall in the "VPT active" case, so we can consider that every VPR write by
+/// one of those instructions is actually an "and".
//===----------------------------------------------------------------------===//
#include "ARM.h"
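// Illustrative sketch (not part of the patch) of the VPR.P0 note above: in a
// "VPT active" context the predicate-setting instructions "and" their result
// into the existing mask, otherwise they overwrite it.
#include <cstdint>
#include <cstdio>

static uint16_t updateP0(uint16_t P0, uint16_t Value, bool VPTActive) {
  // Between a DLSTP/LETP, or for predicated instructions, writes are and-ed in;
  // outside of that the computed value simply replaces P0.
  return VPTActive ? static_cast<uint16_t>(P0 & Value) : Value;
}

int main() {
  const uint16_t P0 = 0x00FF;                          // current lane mask
  std::printf("%#06x\n", updateP0(P0, 0x0F0F, true));  // 0x000f: further masked
  std::printf("%#06x\n", updateP0(P0, 0x0F0F, false)); // 0x0f0f: overwritten
  return 0;
}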
@@ -45,6 +59,7 @@
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
@@ -60,34 +75,93 @@ using namespace llvm;
namespace {
+ using InstSet = SmallPtrSetImpl<MachineInstr *>;
+
+ class PostOrderLoopTraversal {
+ MachineLoop &ML;
+ MachineLoopInfo &MLI;
+ SmallPtrSet<MachineBasicBlock*, 4> Visited;
+ SmallVector<MachineBasicBlock*, 4> Order;
+
+ public:
+ PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
+ : ML(ML), MLI(MLI) { }
+
+ const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
+ return Order;
+ }
+
+ // Visit all the blocks within the loop, as well as exit blocks and any
+ // blocks properly dominating the header.
+ void ProcessLoop() {
+ std::function<void(MachineBasicBlock*)> Search = [this, &Search]
+ (MachineBasicBlock *MBB) -> void {
+ if (Visited.count(MBB))
+ return;
+
+ Visited.insert(MBB);
+ for (auto *Succ : MBB->successors()) {
+ if (!ML.contains(Succ))
+ continue;
+ Search(Succ);
+ }
+ Order.push_back(MBB);
+ };
+
+ // Insert exit blocks.
+ SmallVector<MachineBasicBlock*, 2> ExitBlocks;
+ ML.getExitBlocks(ExitBlocks);
+ for (auto *MBB : ExitBlocks)
+ Order.push_back(MBB);
+
+ // Then add the loop body.
+ Search(ML.getHeader());
+
+ // Then try the preheader and its predecessors.
+ std::function<void(MachineBasicBlock*)> GetPredecessor =
+ [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
+ Order.push_back(MBB);
+ if (MBB->pred_size() == 1)
+ GetPredecessor(*MBB->pred_begin());
+ };
+
+ if (auto *Preheader = ML.getLoopPreheader())
+ GetPredecessor(Preheader);
+ else if (auto *Preheader = MLI.findLoopPreheader(&ML, true))
+ GetPredecessor(Preheader);
+ }
+ };
+
struct PredicatedMI {
MachineInstr *MI = nullptr;
SetVector<MachineInstr*> Predicates;
public:
- PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
- MI(I) {
+ PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
+ assert(I && "Instruction must not be null!");
Predicates.insert(Preds.begin(), Preds.end());
}
};
- // Represent a VPT block, a list of instructions that begins with a VPST and
- // has a maximum of four proceeding instructions. All instructions within the
- // block are predicated upon the vpr and we allow instructions to define the
- // vpr within in the block too.
+ // Represent a VPT block, a list of instructions that begins with a VPT/VPST
+  // and has a maximum of four following instructions. All instructions within
+  // the block are predicated upon the vpr and we allow instructions to define
+  // the vpr within the block too.
class VPTBlock {
- std::unique_ptr<PredicatedMI> VPST;
+ // The predicate then instruction, which is either a VPT, or a VPST
+ // instruction.
+ std::unique_ptr<PredicatedMI> PredicateThen;
PredicatedMI *Divergent = nullptr;
SmallVector<PredicatedMI, 4> Insts;
public:
VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
- VPST = std::make_unique<PredicatedMI>(MI, Preds);
+ PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
}
void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
- if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
+ if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
Divergent = &Insts.back();
LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
}
@@ -104,38 +178,73 @@ namespace {
// Is the given instruction part of the predicate set controlling the entry
// to the block.
bool IsPredicatedOn(MachineInstr *MI) const {
- return VPST->Predicates.count(MI);
+ return PredicateThen->Predicates.count(MI);
+ }
+
+ // Returns true if this is a VPT instruction.
+ bool isVPT() const { return !isVPST(); }
+
+ // Returns true if this is a VPST instruction.
+ bool isVPST() const {
+ return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
}
// Is the given instruction the only predicate which controls the entry to
// the block.
bool IsOnlyPredicatedOn(MachineInstr *MI) const {
- return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
+ return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
}
unsigned size() const { return Insts.size(); }
SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
- MachineInstr *getVPST() const { return VPST->MI; }
+ MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
PredicatedMI *getDivergent() const { return Divergent; }
};
+ struct Reduction {
+ MachineInstr *Init;
+ MachineInstr &Copy;
+ MachineInstr &Reduce;
+ MachineInstr &VPSEL;
+
+ Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add,
+ MachineInstr *Sel)
+ : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { }
+ };
+
struct LowOverheadLoop {
- MachineLoop *ML = nullptr;
+ MachineLoop &ML;
+ MachineBasicBlock *Preheader = nullptr;
+ MachineLoopInfo &MLI;
+ ReachingDefAnalysis &RDA;
+ const TargetRegisterInfo &TRI;
+ const ARMBaseInstrInfo &TII;
MachineFunction *MF = nullptr;
MachineInstr *InsertPt = nullptr;
MachineInstr *Start = nullptr;
MachineInstr *Dec = nullptr;
MachineInstr *End = nullptr;
MachineInstr *VCTP = nullptr;
+ SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
VPTBlock *CurrentBlock = nullptr;
SetVector<MachineInstr*> CurrentPredicate;
SmallVector<VPTBlock, 4> VPTBlocks;
+ SmallPtrSet<MachineInstr*, 4> ToRemove;
+ SmallVector<std::unique_ptr<Reduction>, 1> Reductions;
+ SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
bool Revert = false;
bool CannotTailPredicate = false;
- LowOverheadLoop(MachineLoop *ML) : ML(ML) {
- MF = ML->getHeader()->getParent();
+ LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
+ ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
+ const ARMBaseInstrInfo &TII)
+ : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) {
+ MF = ML.getHeader()->getParent();
+ if (auto *MBB = ML.getLoopPreheader())
+ Preheader = MBB;
+ else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
+ Preheader = MBB;
}
// If this is an MVE instruction, check that we know how to use tail
@@ -151,22 +260,30 @@ namespace {
// For now, let's keep things really simple and only support a single
// block for tail predication.
return !Revert && FoundAllComponents() && VCTP &&
- !CannotTailPredicate && ML->getNumBlocks() == 1;
+ !CannotTailPredicate && ML.getNumBlocks() == 1;
}
- bool ValidateTailPredicate(MachineInstr *StartInsertPt,
- ReachingDefAnalysis *RDA,
- MachineLoopInfo *MLI);
+ // Check that the predication in the loop will be equivalent once we
+ // perform the conversion. Also ensure that we can provide the number
+ // of elements to the loop start instruction.
+ bool ValidateTailPredicate(MachineInstr *StartInsertPt);
+
+ // See whether the live-out instructions are a reduction that we can fixup
+ // later.
+ bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers);
+
+ // Check that any values available outside of the loop will be the same
+ // after tail predication conversion.
+ bool ValidateLiveOuts();
// Is it safe to define LR with DLS/WLS?
// LR can be defined if it is the operand to start, because it's the same
// value, or if it's going to be equivalent to the operand to Start.
- MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA);
+ MachineInstr *isSafeToDefineLR();
// Check the branch targets are within range and we satisfy our
// restrictions.
- void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
- MachineLoopInfo *MLI);
+ void CheckLegality(ARMBasicBlockUtils *BBUtils);
bool FoundAllComponents() const {
return Start && Dec && End;
@@ -241,18 +358,19 @@ namespace {
void RevertWhile(MachineInstr *MI) const;
- bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
+ bool RevertLoopDec(MachineInstr *MI) const;
void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
- void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
-
void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
+ void FixupReductions(LowOverheadLoop &LoLoop) const;
+
MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
void Expand(LowOverheadLoop &LoLoop);
+ void IterationCountDCE(LowOverheadLoop &LoLoop);
};
}
@@ -261,7 +379,7 @@ char ARMLowOverheadLoops::ID = 0;
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)
-MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
+MachineInstr *LowOverheadLoop::isSafeToDefineLR() {
// We can define LR because LR already contains the same value.
if (Start->getOperand(0).getReg() == ARM::LR)
return Start;
@@ -279,52 +397,22 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
// Find an insertion point:
// - Is there a (mov lr, Count) before Start? If so, and nothing else writes
// to Count before Start, we can insert at that mov.
- if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
- if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+ if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR))
+ if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg))
return LRDef;
// - Is there a (mov lr, Count) after Start? If so, and nothing else writes
// to Count after Start, we can insert at that mov.
- if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
- if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+ if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR))
+ if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg))
return LRDef;
// We've found no suitable LR def and Start doesn't use LR directly. Can we
// just define LR anyway?
- if (!RDA->isRegUsedAfter(Start, ARM::LR))
- return Start;
-
- return nullptr;
-}
-
-// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
-// not define a register that is used by any instructions, after and including,
-// 'To'. These instructions also must not redefine any of Froms operands.
-template<typename Iterator>
-static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
- SmallSet<int, 2> Defs;
- // First check that From would compute the same value if moved.
- for (auto &MO : From->operands()) {
- if (!MO.isReg() || MO.isUndef() || !MO.getReg())
- continue;
- if (MO.isDef())
- Defs.insert(MO.getReg());
- else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
- return false;
- }
-
- // Now walk checking that the rest of the instructions will compute the same
- // value.
- for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
- for (auto &MO : I->operands())
- if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
- return false;
- }
- return true;
+ return RDA.isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr;
}
-bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
- ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
+bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
assert(VCTP && "VCTP instruction expected but is not set");
// All predication within the loop should be based on vctp. If the block
// isn't predicated on entry, check whether the vctp is within the block
@@ -332,24 +420,35 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
for (auto &Block : VPTBlocks) {
if (Block.IsPredicatedOn(VCTP))
continue;
- if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) {
+ if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
- << *Block.getDivergent()->MI);
+ << *Block.getDivergent()->MI);
return false;
}
SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
for (auto &PredMI : Insts) {
- if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
+ // Check the instructions in the block and only allow:
+ // - VCTPs
+ // - Instructions predicated on the main VCTP
+ // - Any VCMP
+ // - VCMPs just "and" their result with VPR.P0. Whether they are
+ // located before/after the VCTP is irrelevant - the end result will
+ // be the same in both cases, so there's no point in requiring them
+ // to be located after the VCTP!
+ if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
+ VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
continue;
LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
- << " - which is predicated on:\n";
- for (auto *MI : PredMI.Predicates)
- dbgs() << " - " << *MI;
- );
+ << " - which is predicated on:\n";
+ for (auto *MI : PredMI.Predicates)
+ dbgs() << " - " << *MI);
return false;
}
}
+ if (!ValidateLiveOuts())
+ return false;
+
// For tail predication, we need to provide the number of elements, instead
// of the iteration count, to the loop start instruction. The number of
// elements is provided to the vctp instruction, so we need to check that
@@ -359,7 +458,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
// If the register is defined within loop, then we can't perform TP.
// TODO: Check whether this is just a mov of a register that would be
// available.
- if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
+ if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
return false;
}
@@ -367,17 +466,20 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
// The element count register maybe defined after InsertPt, in which case we
// need to try to move either InsertPt or the def so that the [w|d]lstp can
// use the value.
- MachineBasicBlock *InsertBB = InsertPt->getParent();
- if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
- if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
- if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
+ // TODO: On failing to move an instruction, check if the count is provided by
+ // a mov and whether we can use the mov operand directly.
+ MachineBasicBlock *InsertBB = StartInsertPt->getParent();
+ if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) {
+ if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) {
+ if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) {
ElemDef->removeFromParent();
- InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
+ InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
<< *ElemDef);
- } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
- InsertPt->removeFromParent();
- InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
+ } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) {
+ StartInsertPt->removeFromParent();
+ InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
+ StartInsertPt);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
} else {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
@@ -390,10 +492,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
// Especially in the case of while loops, InsertBB may not be the
// preheader, so we need to check that the register isn't redefined
// before entering the loop.
- auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
+ auto CannotProvideElements = [this](MachineBasicBlock *MBB,
Register NumElements) {
// NumElements is redefined in this block.
- if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
+ if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
return true;
// Don't continue searching up through multiple predecessors.
@@ -404,7 +506,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
};
// First, find the block that looks like the preheader.
- MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
+ MachineBasicBlock *MBB = Preheader;
if (!MBB) {
LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n");
return false;
@@ -419,13 +521,372 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
MBB = *MBB->pred_begin();
}
- LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
+ // Check that the value change of the element count is what we expect and
+ // that the predication will be equivalent. For this we need:
+ // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
+ // and we can also allow register copies within the chain too.
+ auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
+ return -getAddSubImmediate(*MI) == ExpectedVecWidth;
+ };
+
+ MBB = VCTP->getParent();
+ if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) {
+ SmallPtrSet<MachineInstr*, 2> ElementChain;
+ SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+ unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
+
+ Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+
+ if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
+ bool FoundSub = false;
+
+ for (auto *MI : ElementChain) {
+ if (isMovRegOpcode(MI->getOpcode()))
+ continue;
+
+ if (isSubImmOpcode(MI->getOpcode())) {
+ if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth))
+ return false;
+ FoundSub = true;
+ } else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n";
+ for (auto *MI : ElementChain)
+ dbgs() << " - " << *MI);
+ ToRemove.insert(ElementChain.begin(), ElementChain.end());
+ }
+ }
+ return true;
+}
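// A standalone sketch (not part of this patch) of the element-count chain check
// above: between the loop back-edge and the vctp, the element count should only
// change via register moves plus a single subtract of the vector width.
#include <string>
#include <vector>

struct ChainOp {
  std::string Kind; // "mov" for a register copy, "subimm" for a sub-immediate
  int Imm;          // immediate for "subimm" ops, ignored otherwise
};

static bool isValidElementChain(const std::vector<ChainOp> &Chain, int VecWidth) {
  bool FoundSub = false;
  for (const ChainOp &Op : Chain) {
    if (Op.Kind == "mov")
      continue;                           // plain copies don't change the value
    if (Op.Kind == "subimm") {
      if (FoundSub || Op.Imm != VecWidth)
        return false;                     // at most one sub, by the vector width
      FoundSub = true;
    } else {
      return false;                       // any other operation breaks the chain
    }
  }
  return FoundSub;                        // the count must actually decrease
}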
+
+static bool isVectorPredicated(MachineInstr *MI) {
+ int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
+ return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
+}
+
+static bool isRegInClass(const MachineOperand &MO,
+ const TargetRegisterClass *Class) {
+ return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
+}
+
+// MVE 'narrowing' operations operate on half a lane, reading from half and
+// writing to half, which are referred to as the top and bottom half. The
+// other half retains its previous value.
+static bool retainsPreviousHalfElement(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
+}
+
+// Some MVE instructions read from the top/bottom halves of their operand(s)
+// and generate a vector result with elements that are double the
+// width of the input.
+static bool producesDoubleWidthResult(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::DoubleWidthResult) != 0;
+}
+
+static bool isHorizontalReduction(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::HorizontalReduction) != 0;
+}
+
+// Can this instruction generate a non-zero result when given only zeroed
+// operands? This allows us to know that, given operands with false bytes
+// zeroed by masked loads, the result will also contain zeros in those
+// bytes.
+static bool canGenerateNonZeros(const MachineInstr &MI) {
+
+ // Check for instructions which can write into a larger element size,
+ // possibly writing into a previous zero'd lane.
+ if (producesDoubleWidthResult(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
+ // fp16 -> fp32 vector conversions.
+ // Instructions that perform a NOT will generate 1s from 0s.
+ case ARM::MVE_VMVN:
+ case ARM::MVE_VORN:
+ // Count leading zeros will do just that!
+ case ARM::MVE_VCLZs8:
+ case ARM::MVE_VCLZs16:
+ case ARM::MVE_VCLZs32:
+ return true;
+ }
+ return false;
+}
+
+
+// Look at its register uses to see if it can only receive zeros
+// into its false lanes which would then produce zeros. Also check that
+// the output register is also defined by a FalseLanesZero instruction
+// so that if tail-predication happens, the lanes that aren't updated will
+// still be zeros.
+static bool producesFalseLanesZero(MachineInstr &MI,
+ const TargetRegisterClass *QPRs,
+ const ReachingDefAnalysis &RDA,
+ InstSet &FalseLanesZero) {
+ if (canGenerateNonZeros(MI))
+ return false;
+
+ bool AllowScalars = isHorizontalReduction(MI);
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (!isRegInClass(MO, QPRs) && AllowScalars)
+ continue;
+ if (auto *OpDef = RDA.getMIOperand(&MI, MO))
+ if (FalseLanesZero.count(OpDef))
+ continue;
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
+ return true;
+}
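// Illustrative sketch (not from the patch) of the "false lanes zero" reasoning
// above: if every input has zeros in the falsely predicated lanes and the
// operation is lane-wise and maps zero operands to zero, the result keeps zeros
// in those lanes. An operation like VMVN breaks this, since it turns 0 into ~0.
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 4> A = {1, 2, 0, 0}; // lanes 2 and 3 are "false" and zeroed
  std::array<uint32_t, 4> B = {5, 6, 0, 0};
  std::array<uint32_t, 4> Add{}, Not{};
  for (int I = 0; I < 4; ++I) {
    Add[I] = A[I] + B[I];                   // zero-preserving: 0 + 0 == 0
    Not[I] = ~A[I];                         // not zero-preserving: ~0u != 0
  }
  assert(Add[2] == 0 && Add[3] == 0);       // add keeps the false lanes zero
  assert(Not[2] != 0 && Not[3] != 0);       // a NOT would generate ones instead
  return 0;
}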
+
+bool
+LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) {
+  // Also check for reductions where the operation needs to merge values
+  // from the last and previous loop iterations. This means an instruction
+ // producing a value and a vmov storing the value calculated in the previous
+ // iteration. So we can have two live-out regs, one produced by a vmov and
+ // both being consumed by a vpsel.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n";
+ for (auto *MI : LiveMIs)
+ dbgs() << " - " << *MI);
+
+ if (!Preheader)
+ return false;
+
+ // Expect a vmov, a vadd and a single vpsel user.
+ // TODO: This means we can't currently support multiple reductions in the
+ // loop.
+ if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1)
+ return false;
+
+ MachineInstr *VPSEL = *LiveOutUsers.begin();
+ if (VPSEL->getOpcode() != ARM::MVE_VPSEL)
+ return false;
+
+ unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1;
+ MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx);
+ if (!Pred || Pred != VCTP) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n");
+ return false;
+ }
+
+ MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1);
+ if (!Reduce)
+ return false;
+
+ assert(LiveMIs.count(Reduce) && "Expected MI to be live-out");
+
+ // TODO: Support more operations than VADD.
+ switch (VCTP->getOpcode()) {
+ default:
+ return false;
+ case ARM::MVE_VCTP8:
+ if (Reduce->getOpcode() != ARM::MVE_VADDi8)
+ return false;
+ break;
+ case ARM::MVE_VCTP16:
+ if (Reduce->getOpcode() != ARM::MVE_VADDi16)
+ return false;
+ break;
+ case ARM::MVE_VCTP32:
+ if (Reduce->getOpcode() != ARM::MVE_VADDi32)
+ return false;
+ break;
+ }
+
+ // Test that the reduce op is overwriting one of its operands.
+ if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() &&
+ Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n");
+ return false;
+ }
+
+ // Check that the VORR is actually a VMOV.
+ MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2);
+ if (!Copy || Copy->getOpcode() != ARM::MVE_VORR ||
+ !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() ||
+ Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg())
+ return false;
+
+ assert(LiveMIs.count(Copy) && "Expected MI to be live-out");
+
+ // Check that the vadd and vmov are only used by each other and the vpsel.
+ SmallPtrSet<MachineInstr*, 2> CopyUsers;
+ RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers);
+ if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n");
+ return false;
+ }
+
+ SmallPtrSet<MachineInstr*, 2> ReduceUsers;
+ RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers);
+ if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n");
+ return false;
+ }
+
+ // Then find whether there's an instruction initialising the register that
+ // is storing the reduction.
+ SmallPtrSet<MachineInstr*, 2> Incoming;
+ RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming);
+ if (Incoming.size() > 1)
+ return false;
+
+ MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n"
+ << " - " << *Copy
+ << " - " << *Reduce
+ << " - " << *VPSEL);
+ Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce, VPSEL));
return true;
}
-void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
- ReachingDefAnalysis *RDA,
- MachineLoopInfo *MLI) {
+bool LowOverheadLoop::ValidateLiveOuts() {
+ // We want to find out if the tail-predicated version of this loop will
+ // produce the same values as the loop in its original form. For this to
+ // be true, the newly inserted implicit predication must not change the
+ // (observable) results.
+ // We're doing this because many instructions in the loop will not be
+ // predicated and so the conversion from VPT predication to tail-predication
+ // can result in different values being produced; due to the tail-predication
+ // preventing many instructions from updating their falsely predicated
+ // lanes. This analysis assumes that all the instructions perform lane-wise
+ // operations and don't perform any exchanges.
+ // A masked load, whether through VPT or tail predication, will write zeros
+ // to any of the falsely predicated bytes. So, from the loads, we know that
+ // the false lanes are zeroed and here we're trying to track that those false
+ // lanes remain zero, or where they change, the differences are masked away
+ // by their user(s).
+ // All MVE loads and stores have to be predicated, so we know that any load
+ // operands, or stored results are equivalent already. Other explicitly
+ // predicated instructions will perform the same operation in the original
+ // loop and the tail-predicated form too. Because of this, we can insert
+ // loads, stores and other predicated instructions into our Predicated
+ // set and build from there.
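+ // As a rough illustration: a VPT- or tail-predicated VLDR writes zeros into
+ // its false lanes, and a lane-wise operation such as a VADD of two such
+ // values keeps those lanes zero, so both defs can be treated as
+ // FalseLanesZero. By contrast, a horizontal reduction (e.g. VADDV) or an
+ // instruction widening into previously zeroed lanes cannot be reasoned
+ // about this way and is handled separately below.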
+ const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
+ SetVector<MachineInstr *> FalseLanesUnknown;
+ SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
+ SmallPtrSet<MachineInstr *, 4> Predicated;
+ MachineBasicBlock *Header = ML.getHeader();
+
+ for (auto &MI : *Header) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
+ continue;
+
+ if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
+ continue;
+
+ // Predicated loads will write zeros to the falsely predicated bytes of the
+ // destination register.
+ if (isVectorPredicated(&MI)) {
+ if (MI.mayLoad())
+ FalseLanesZero.insert(&MI);
+ Predicated.insert(&MI);
+ continue;
+ }
+
+ if (MI.getNumDefs() == 0)
+ continue;
+
+ if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) {
+ // We require retaining and horizontal operations to operate upon zero'd
+ // false lanes to ensure the conversion doesn't change the output.
+ if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI))
+ return false;
+ // Otherwise we need to evaluate this instruction later to see whether
+ // unknown false lanes will get masked away by their user(s).
+ FalseLanesUnknown.insert(&MI);
+ } else if (!isHorizontalReduction(MI))
+ FalseLanesZero.insert(&MI);
+ }
+
+ auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
+ SmallPtrSetImpl<MachineInstr *> &Predicated) {
+ SmallPtrSet<MachineInstr *, 2> Uses;
+ RDA.getGlobalUses(MI, MO.getReg(), Uses);
+ for (auto *Use : Uses) {
+ if (Use != MI && !Predicated.count(Use))
+ return false;
+ }
+ return true;
+ };
+
+ // Visit the unknowns in reverse so that we can start at the values being
+ // stored and work towards the leaves, hopefully adding more instructions
+ // to Predicated. Successfully terminating the loop means that all the
+ // unknown values have been found to be masked by predicated user(s). Any
+ // unpredicated values are stored in NonPredicated so that we can later
+ // check whether they form a reduction.
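+ // For example, a def with unknown false lanes whose only user is a
+ // predicated VSTR is fine: the store never writes the falsely predicated
+ // bytes, so the unknown lanes are masked away and the def can join the
+ // Predicated set.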
+ SmallPtrSet<MachineInstr*, 2> NonPredicated;
+ for (auto *MI : reverse(FalseLanesUnknown)) {
+ for (auto &MO : MI->operands()) {
+ if (!isRegInClass(MO, QPRs) || !MO.isDef())
+ continue;
+ if (!HasPredicatedUsers(MI, MO, Predicated)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
+ << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
+ NonPredicated.insert(MI);
+ continue;
+ }
+ }
+ // Any unknown false lanes have been masked away by the user(s).
+ Predicated.insert(MI);
+ }
+
+ SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
+ SmallPtrSet<MachineInstr*, 2> LiveOutUsers;
+ SmallVector<MachineBasicBlock *, 2> ExitBlocks;
+ ML.getExitBlocks(ExitBlocks);
+ assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
+ assert(ExitBlocks.size() == 1 && "Expected a single exit block");
+ MachineBasicBlock *ExitBB = ExitBlocks.front();
+ for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
+ // Check Q-regs that are live in the exit blocks. We don't collect scalars
+ // because they won't be affected by lane predication.
+ if (QPRs->contains(RegMask.PhysReg)) {
+ if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
+ LiveOutMIs.insert(MI);
+ RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers);
+ }
+ }
+
+ // If we have any non-predicated live-outs, they need to be part of a
+ // reduction that we can fixup later. The reduction takes the form of an
+ // operation that uses its previous value through a vmov, with a vpsel
+ // residing in the exit block to select the final bytes from the n and n-1
+ // iterations.
+ if (!NonPredicated.empty() &&
+ !FindValidReduction(NonPredicated, LiveOutUsers))
+ return false;
+
+ // We've already validated that any VPT predication within the loop will be
+ // equivalent when we perform the predication transformation; so we know that
+ // any VPT predicated instruction is predicated upon VCTP. Any live-out
+ // instruction needs to be predicated, so check this here. The instructions
+ // in NonPredicated have been found to form a reduction whose legality we
+ // can ensure.
+ for (auto *MI : LiveOutMIs)
+ if (!isVectorPredicated(MI) && !NonPredicated.count(MI))
+ return false;
+
+ return true;
+}
+
+void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
if (Revert)
return;
@@ -434,7 +895,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
// TODO Maybe there's cases where the target doesn't have to be the header,
// but for now be safe and revert.
- if (End->getOperand(1).getMBB() != ML->getHeader()) {
+ if (End->getOperand(1).getMBB() != ML.getHeader()) {
LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n");
Revert = true;
return;
@@ -442,8 +903,8 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
// The WLS and LE instructions have 12-bits for the label offset. WLS
// requires a positive offset, while LE uses negative.
- if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
- !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
+ if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
+ !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
Revert = true;
return;
@@ -458,7 +919,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
return;
}
- InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA);
+ InsertPt = Revert ? nullptr : isSafeToDefineLR();
if (!InsertPt) {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
Revert = true;
@@ -473,9 +934,9 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
return;
}
- assert(ML->getBlocks().size() == 1 &&
+ assert(ML.getBlocks().size() == 1 &&
"Shouldn't be processing a loop with more than one block");
- CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI);
+ CannotTailPredicate = !ValidateTailPredicate(InsertPt);
LLVM_DEBUG(if (CannotTailPredicate)
dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
}
@@ -484,29 +945,44 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
if (CannotTailPredicate)
return false;
- // Only support a single vctp.
- if (isVCTP(MI) && VCTP)
- return false;
+ if (isVCTP(MI)) {
+ // If we find another VCTP, check whether it uses the same value as the
+ // main VCTP. If it does, store it in the SecondaryVCTPs set, else refuse it.
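+ // This can happen when, for example, a VCTP has been duplicated into a
+ // VPT block; it still computes the same predicate from the same element
+ // count, so we can treat it as equivalent to the main VCTP.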
+ if (VCTP) {
+ if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+ !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+ "definition from the main VCTP");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
+ SecondaryVCTPs.insert(MI);
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
+ VCTP = MI;
+ }
+ } else if (isVPTOpcode(MI->getOpcode())) {
+ if (MI->getOpcode() != ARM::MVE_VPST) {
+ assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
+ "VPT does not implicitly define VPR?!");
+ CurrentPredicate.insert(MI);
+ }
- // Start a new vpt block when we discover a vpt.
- if (MI->getOpcode() == ARM::MVE_VPST) {
VPTBlocks.emplace_back(MI, CurrentPredicate);
CurrentBlock = &VPTBlocks.back();
return true;
- } else if (isVCTP(MI))
- VCTP = MI;
- else if (MI->getOpcode() == ARM::MVE_VPSEL ||
- MI->getOpcode() == ARM::MVE_VPNOT)
+ } else if (MI->getOpcode() == ARM::MVE_VPSEL ||
+ MI->getOpcode() == ARM::MVE_VPNOT) {
+ // TODO: Allow VPSEL and VPNOT, we currently cannot because:
+ // 1) It will use the VPR as a predicate operand, but doesn't have to be
+ // inside a VPT block, which means we can assert while building up
+ // the VPT block because we don't find another VPT or VPST to begin a new
+ // one.
+ // 2) VPSEL still requires a VPR operand even after tail predicating,
+ // which means we can't remove it unless there is another
+ // instruction, such as vcmp, that can provide the VPR def.
return false;
-
- // TODO: Allow VPSEL and VPNOT, we currently cannot because:
- // 1) It will use the VPR as a predicate operand, but doesn't have to be
- // instead a VPT block, which means we can assert while building up
- // the VPT block because we don't find another VPST to being a new
- // one.
- // 2) VPSEL still requires a VPR operand even after tail predicating,
- // which means we can't remove it unless there is another
- // instruction, such as vcmp, that can provide the VPR def.
+ }
bool IsUse = false;
bool IsDef = false;
@@ -548,7 +1024,9 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
return false;
}
- return true;
+ // If the instruction is already explicitly predicated, then the conversion
+ // will be fine, but ensure that all memory operations are predicated.
+ return !IsUse && MI->mayLoadOrStore() ? false : true;
}
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@@ -591,6 +1069,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
dbgs() << " - " << Preheader->getName() << "\n";
else if (auto *Preheader = MLI->findLoopPreheader(ML))
dbgs() << " - " << Preheader->getName() << "\n";
+ else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
+ dbgs() << " - " << Preheader->getName() << "\n";
for (auto *MBB : ML->getBlocks())
dbgs() << " - " << MBB->getName() << "\n";
);
@@ -608,14 +1088,12 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return nullptr;
};
- LowOverheadLoop LoLoop(ML);
+ LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
// Search the preheader for the start intrinsic.
// FIXME: I don't see why we shouldn't be supporting multiple predecessors
// with potentially multiple set.loop.iterations, so we need to enable this.
- if (auto *Preheader = ML->getLoopPreheader())
- LoLoop.Start = SearchForStart(Preheader);
- else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
- LoLoop.Start = SearchForStart(Preheader);
+ if (LoLoop.Preheader)
+ LoLoop.Start = SearchForStart(LoLoop.Preheader);
else
return false;
@@ -624,7 +1102,9 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// whether we can convert that predicate using tail predication.
for (auto *MBB : reverse(ML->getBlocks())) {
for (auto &MI : *MBB) {
- if (MI.getOpcode() == ARM::t2LoopDec)
+ if (MI.isDebugValue())
+ continue;
+ else if (MI.getOpcode() == ARM::t2LoopDec)
LoLoop.Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
LoLoop.End = &MI;
@@ -641,28 +1121,6 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// Check we know how to tail predicate any mve instructions.
LoLoop.AnalyseMVEInst(&MI);
}
-
- // We need to ensure that LR is not used or defined inbetween LoopDec and
- // LoopEnd.
- if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
- continue;
-
- // If we find that LR has been written or read between LoopDec and
- // LoopEnd, expect that the decremented value is being used else where.
- // Because this value isn't actually going to be produced until the
- // latch, by LE, we would need to generate a real sub. The value is also
- // likely to be copied/reloaded for use of LoopEnd - in which in case
- // we'd need to perform an add because it gets subtracted again by LE!
- // The other option is to then generate the other form of LE which doesn't
- // perform the sub.
- for (auto &MO : MI.operands()) {
- if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
- MO.getReg() == ARM::LR) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
- LoLoop.Revert = true;
- break;
- }
- }
}
}
@@ -672,7 +1130,15 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return false;
}
- LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
+ // Check that the only instruction using LoopDec is LoopEnd.
+ // TODO: Check for copy chains that really have no effect.
+ SmallPtrSet<MachineInstr*, 2> Uses;
+ RDA->getReachingLocalUses(LoLoop.Dec, ARM::LR, Uses);
+ if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
+ LoLoop.Revert = true;
+ }
+ LoLoop.CheckLegality(BBUtils.get());
Expand(LoLoop);
return true;
}
@@ -702,16 +1168,19 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
MI->eraseFromParent();
}
-bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
- bool SetFlags) const {
+bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
+ SmallPtrSet<MachineInstr*, 1> Ignore;
+ for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
+ if (I->getOpcode() == ARM::t2LoopEnd) {
+ Ignore.insert(&*I);
+ break;
+ }
+ }
// If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
- if (SetFlags &&
- (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
- !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
- SetFlags = false;
+ bool SetFlags = RDA->isSafeToDefRegAt(MI, ARM::CPSR, Ignore);
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2SUBri));
@@ -759,7 +1228,102 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
MI->eraseFromParent();
}
+// Perform dead code elimination on the loop iteration count setup expression.
+// If we are tail-predicating, the number of elements to be processed is the
+// operand of the VCTP instruction in the vector body, see getCount(), which is
+// register $r3 in this example:
+//
+// $lr = big-itercount-expression
+// ..
+// t2DoLoopStart renamable $lr
+// vector.body:
+// ..
+// $vpr = MVE_VCTP32 renamable $r3
+// renamable $lr = t2LoopDec killed renamable $lr, 1
+// t2LoopEnd renamable $lr, %vector.body
+// tB %end
+//
+// What we would like to achieve here is to replace the do-loop start pseudo
+// instruction t2DoLoopStart with:
+//
+// $lr = MVE_DLSTP_32 killed renamable $r3
+//
+// Thus, $r3, which defines the number of elements, is written to $lr,
+// and then we want to delete the whole chain that used to define $lr;
+// see the comment below for what this chain can look like.
+//
+void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
+ if (!LoLoop.IsTailPredicationLegal())
+ return;
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
+
+ MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0);
+ if (!Def) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
+ return;
+ }
+
+ // Collect and remove the users of the iteration count.
+ SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
+ LoLoop.End, LoLoop.InsertPt };
+ SmallPtrSet<MachineInstr*, 2> Remove;
+ if (RDA->isSafeToRemove(Def, Remove, Killed))
+ LoLoop.ToRemove.insert(Remove.begin(), Remove.end());
+ else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
+ return;
+ }
+
+ // Collect the dead code and the MBBs in which they reside.
+ RDA->collectKilledOperands(Def, Killed);
+ SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
+ for (auto *MI : Killed)
+ BasicBlocks.insert(MI->getParent());
+
+ // Collect IT blocks in all affected basic blocks.
+ std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
+ for (auto *MBB : BasicBlocks) {
+ for (auto &MI : *MBB) {
+ if (MI.getOpcode() != ARM::t2IT)
+ continue;
+ RDA->getReachingLocalUses(&MI, ARM::ITSTATE, ITBlocks[&MI]);
+ }
+ }
+
+ // If we're removing all of the instructions within an IT block, then
+ // also remove the IT instruction.
+ SmallPtrSet<MachineInstr*, 2> ModifiedITs;
+ for (auto *MI : Killed) {
+ if (MachineOperand *MO = MI->findRegisterUseOperand(ARM::ITSTATE)) {
+ MachineInstr *IT = RDA->getMIOperand(MI, *MO);
+ auto &CurrentBlock = ITBlocks[IT];
+ CurrentBlock.erase(MI);
+ if (CurrentBlock.empty())
+ ModifiedITs.erase(IT);
+ else
+ ModifiedITs.insert(IT);
+ }
+ }
+
+ // Delete the killed instructions only if we don't have any IT blocks that
+ // need to be modified because we need to fixup the mask.
+ // TODO: Handle cases where IT blocks are modified.
+ if (ModifiedITs.empty()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will remove iteration count:\n";
+ for (auto *MI : Killed)
+ dbgs() << " - " << *MI);
+ LoLoop.ToRemove.insert(Killed.begin(), Killed.end());
+ } else
+ LLVM_DEBUG(dbgs() << "ARM Loops: Would need to modify IT block(s).\n");
+}
+
MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
+ // When using tail-predication, try to delete the dead code that was used to
+ // calculate the number of loop iterations.
+ IterationCountDCE(LoLoop);
+
MachineInstr *InsertPt = LoLoop.InsertPt;
MachineInstr *Start = LoLoop.Start;
MachineBasicBlock *MBB = InsertPt->getParent();
@@ -775,109 +1339,67 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
if (!IsDo)
MIB.add(Start->getOperand(1));
- // When using tail-predication, try to delete the dead code that was used to
- // calculate the number of loop iterations.
- if (LoLoop.IsTailPredicationLegal()) {
- SmallVector<MachineInstr*, 4> Killed;
- SmallVector<MachineInstr*, 4> Dead;
- if (auto *Def = RDA->getReachingMIDef(Start,
- Start->getOperand(0).getReg())) {
- Killed.push_back(Def);
-
- while (!Killed.empty()) {
- MachineInstr *Def = Killed.back();
- Killed.pop_back();
- Dead.push_back(Def);
- for (auto &MO : Def->operands()) {
- if (!MO.isReg() || !MO.isKill())
- continue;
-
- MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
- if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
- Killed.push_back(Kill);
- }
- }
- for (auto *MI : Dead)
- MI->eraseFromParent();
- }
- }
-
// If we're inserting at a mov lr, then remove it as it's redundant.
if (InsertPt != Start)
- InsertPt->eraseFromParent();
- Start->eraseFromParent();
+ LoLoop.ToRemove.insert(InsertPt);
+ LoLoop.ToRemove.insert(Start);
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
return &*MIB;
}
-// Goal is to optimise and clean-up these loops:
-//
-// vector.body:
-// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
-// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
-// ..
-// $lr = MVE_DLSTP_32 renamable $r3
-//
-// The SUB is the old update of the loop iteration count expression, which
-// is no longer needed. This sub is removed when the element count, which is in
-// r3 in this example, is defined by an instruction in the loop, and it has
-// no uses.
-//
-void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
- Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
- MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
-
- if (LoLoop.ML->getNumBlocks() != 1) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n");
- return;
- }
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";
- LoLoop.VCTP->getOperand(1).dump());
-
- // Find the definition we are interested in removing, if there is one.
- MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
- if (!Def) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n");
- return;
- }
-
- // Bail if we define CPSR and it is not dead
- if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
- return;
- }
-
- // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
- if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
- return;
- }
+void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n");
+ auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) {
+ MachineBasicBlock *MBB = InsertPt.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR));
+ MIB.addDef(To);
+ MIB.addReg(From);
+ MIB.addReg(From);
+ MIB.addImm(0);
+ MIB.addReg(0);
+ MIB.addReg(To);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB);
+ };
- // Bail if there are uses after this Def in the block.
- SmallVector<MachineInstr*, 4> Uses;
- RDA->getReachingLocalUses(Def, ElemCount, Uses);
- if (Uses.size()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
- return;
- }
+ for (auto &Reduction : LoLoop.Reductions) {
+ MachineInstr &Copy = Reduction->Copy;
+ MachineInstr &Reduce = Reduction->Reduce;
+ Register DestReg = Copy.getOperand(0).getReg();
- Uses.clear();
- RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
+ // Change the initialiser if present
+ if (Reduction->Init) {
+ MachineInstr *Init = Reduction->Init;
- // Remove Def if there are no uses, or if the only use is the VCTP
- // instruction.
- if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
- Def->dump());
- Def->eraseFromParent();
- return;
+ for (unsigned i = 0; i < Init->getNumOperands(); ++i) {
+ MachineOperand &MO = Init->getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isTied() &&
+ Init->findTiedOperandIdx(i) == 0)
+ Init->getOperand(i).setReg(DestReg);
+ }
+ Init->getOperand(0).setReg(DestReg);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init);
+ } else
+ BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg());
+
+ // Change the reducing op to write to the register that is used to copy
+ // its value on the next iteration. Also update the tied-def operand.
+ Reduce.getOperand(0).setReg(DestReg);
+ Reduce.getOperand(5).setReg(DestReg);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce);
+
+ // Instead of a vpsel, just copy the register into the necessary one.
+ MachineInstr &VPSEL = Reduction->VPSEL;
+ if (VPSEL.getOperand(0).getReg() != DestReg)
+ BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg);
+
+ // Remove the unnecessary instructions.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n"
+ << " - " << Copy
+ << " - " << VPSEL << "\n");
+ Copy.eraseFromParent();
+ VPSEL.eraseFromParent();
}
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";
- for (auto U : Uses) U->dump());
}
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
@@ -893,28 +1415,24 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
};
// There are a few scenarios which we have to fix up:
- // 1) A VPT block with is only predicated by the vctp and has no internal vpr
- // defs.
- // 2) A VPT block which is only predicated by the vctp but has an internal
- // vpr def.
- // 3) A VPT block which is predicated upon the vctp as well as another vpr
- // def.
- // 4) A VPT block which is not predicated upon a vctp, but contains it and
- // all instructions within the block are predicated upon in.
-
+ // 1. VPT Blocks with non-uniform predicates:
+ // - a. When the divergent instruction is a vctp
+ // - b. When the block uses a vpst, and is only predicated on the vctp
+ // - c. When the block uses a vpt and (optionally) contains one or more
+ // vctp.
+ // 2. VPT Blocks with uniform predicates:
+ // - a. The block uses a vpst, and is only predicated on the vctp
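+ // For instance (sketch only), in case 2.a a block such as:
+ //   MVE_VPST ...
+ //   MVE_VLDRWU32 ..., ARMVCC::Then, renamable $vpr
+ // is handled by removing the VPST and stripping the predicate operands from
+ // the load, since the loop itself now provides the tail predication.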
for (auto &Block : LoLoop.getVPTBlocks()) {
SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
if (Block.HasNonUniformPredicate()) {
PredicatedMI *Divergent = Block.getDivergent();
if (isVCTP(Divergent->MI)) {
- // The vctp will be removed, so the size of the vpt block needs to be
- // modified.
- uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
- Block.getVPST()->getOperand(0).setImm(Size);
- LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
- } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
- // The VPT block has a non-uniform predicate but it's entry is guarded
- // only by a vctp, which means we:
+ // The vctp will be removed, so the block mask of the vp(s)t will need
+ // to be recomputed.
+ LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
+ } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+ // The VPT block has a non-uniform predicate but it uses a vpst and its
+ // entry is guarded only by a vctp, which means we:
// - Need to remove the original vpst.
// - Then need to unpredicate any following instructions, until
// we come across the divergent vpr def.
@@ -922,7 +1440,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
// the divergent vpr def.
// TODO: We could be producing more VPT blocks than necessary and could
// fold the newly created one into a proceeding one.
- for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
+ for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
RemovePredicate(&*I);
@@ -935,28 +1453,58 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
++Size;
++I;
}
+ // Create a VPST (with a null mask for now, we'll recompute it later).
MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
InsertAt->getDebugLoc(),
TII->get(ARM::MVE_VPST));
- MIB.addImm(getARMVPTBlockMask(Size));
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+ MIB.addImm(0);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
- Block.getVPST()->eraseFromParent();
+ LoLoop.ToRemove.insert(Block.getPredicateThen());
+ LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+ }
+ // Else, if the block uses a vpt, iterate over the block, removing the
+ // extra VCTPs it may contain.
+ else if (Block.isVPT()) {
+ bool RemovedVCTP = false;
+ for (PredicatedMI &Elt : Block.getInsts()) {
+ MachineInstr *MI = Elt.MI;
+ if (isVCTP(MI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
+ LoLoop.ToRemove.insert(MI);
+ RemovedVCTP = true;
+ continue;
+ }
+ }
+ if (RemovedVCTP)
+ LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
}
- } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
- // A vpt block which is only predicated upon vctp and has no internal vpr
- // defs:
+ } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
+ // A vpt block starting with a VPST that is only predicated upon the vctp
+ // and has no internal vpr defs:
// - Remove vpst.
// - Unpredicate the remaining instructions.
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
- Block.getVPST()->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
+ LoLoop.ToRemove.insert(Block.getPredicateThen());
for (auto &PredMI : Insts)
RemovePredicate(PredMI.MI);
}
}
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
- LoLoop.VCTP->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
+ // Remove the "main" VCTP
+ LoLoop.ToRemove.insert(LoLoop.VCTP);
+ LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP);
+ // Remove remaining secondary VCTPs
+ for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
+ // All VCTPs that aren't marked for removal yet should be unpredicated ones.
+ // The predicated ones should have already been marked for removal when
+ // visiting the VPT blocks.
+ if (LoLoop.ToRemove.insert(VCTP).second) {
+ assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
+ "Removing Predicated VCTP without updating the block mask!");
+ LLVM_DEBUG(dbgs() << " " << *VCTP);
+ }
+ }
}
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
@@ -973,9 +1521,8 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
MIB.add(End->getOperand(0));
MIB.add(End->getOperand(1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
-
- LoLoop.End->eraseFromParent();
- LoLoop.Dec->eraseFromParent();
+ LoLoop.ToRemove.insert(LoLoop.Dec);
+ LoLoop.ToRemove.insert(End);
return &*MIB;
};
@@ -1001,7 +1548,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
RevertWhile(LoLoop.Start);
else
LoLoop.Start->eraseFromParent();
- bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
+ bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec);
RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
} else {
LoLoop.Start = ExpandLoopStart(LoLoop);
@@ -1009,10 +1556,35 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
LoLoop.End = ExpandLoopEnd(LoLoop);
RemoveDeadBranch(LoLoop.End);
if (LoLoop.IsTailPredicationLegal()) {
- RemoveLoopUpdate(LoLoop);
ConvertVPTBlocks(LoLoop);
+ FixupReductions(LoLoop);
+ }
+ for (auto *I : LoLoop.ToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
+ I->eraseFromParent();
+ }
+ for (auto *I : LoLoop.BlockMasksToRecompute) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
+ recomputeVPTBlockMask(*I);
+ LLVM_DEBUG(dbgs() << " ... done: " << *I);
}
}
+
+ PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
+ DFS.ProcessLoop();
+ const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
+ for (auto *MBB : PostOrder) {
+ recomputeLiveIns(*MBB);
+ // FIXME: For some reason, the live-in print order is non-deterministic for
+ // our tests and I can't figure out why... So just sort them.
+ MBB->sortUniqueLiveIns();
+ }
+
+ for (auto *MBB : reverse(PostOrder))
+ recomputeLivenessFlags(*MBB);
+
+ // We've moved, removed and inserted new instructions, so update RDA.
+ RDA->reset();
}
bool ARMLowOverheadLoops::RevertNonLoops() {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index 8e01b998d900..f893faa4cf97 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -194,9 +194,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
// BLX ip
// POP{ r0, lr }
//
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Emit "B #20" instruction, which jumps over the next 24 bytes (because
@@ -209,8 +209,8 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
emitNops(NoopsInSledCount);
- OutStreamer->EmitLabel(Target);
- recordSled(CurSled, MI, Kind);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, Kind, 2);
}
void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 3b676ca4c883..507c3e69b3a4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -15,4 +15,6 @@ void ARMFunctionInfo::anchor() {}
ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
: isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
- hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()) {}
+ hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
+ IsCmseNSEntry(MF.getFunction().hasFnAttribute("cmse_nonsecure_entry")),
+ IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")) {}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index bb136e92329b..298c8a238987 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -58,10 +58,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// emitPrologue.
bool RestoreSPFromFP = false;
- /// LRSpilledForFarJump - True if the LR register has been for spilled to
- /// enable far jump.
- bool LRSpilledForFarJump = false;
-
/// LRSpilled - True if the LR register has been for spilled for
/// any reason, so it's legal to emit an ARM::tBfar (i.e. "bl").
bool LRSpilled = false;
@@ -87,6 +83,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
/// areas.
+ unsigned FPCXTSaveSize = 0;
unsigned GPRCS1Size = 0;
unsigned GPRCS2Size = 0;
unsigned DPRCSAlignGapSize = 0;
@@ -109,6 +106,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// HasITBlocks - True if IT blocks have been inserted.
bool HasITBlocks = false;
+ // Security Extensions
+ bool IsCmseNSEntry;
+ bool IsCmseNSCall;
+
/// CPEClones - Track constant pool entries clones created by Constant Island
/// pass.
DenseMap<unsigned, unsigned> CPEClones;
@@ -144,6 +145,9 @@ public:
bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
bool isThumb2Function() const { return isThumb && hasThumb2; }
+ bool isCmseNSEntryFunction() const { return IsCmseNSEntry; }
+ bool isCmseNSCallFunction() const { return IsCmseNSCall; }
+
unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
@@ -162,9 +166,6 @@ public:
bool isLRSpilled() const { return LRSpilled; }
void setLRIsSpilled(bool s) { LRSpilled = s; }
- bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
- void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
-
unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
@@ -179,11 +180,13 @@ public:
void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
+ unsigned getFPCXTSaveAreaSize() const { return FPCXTSaveSize; }
unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; }
unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
+ void setFPCXTSaveAreaSize(unsigned s) { FPCXTSaveSize = s; }
void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; }
@@ -252,6 +255,7 @@ public:
}
DenseMap<unsigned, unsigned> EHPrologueRemappedRegs;
+ DenseMap<unsigned, unsigned> EHPrologueOffsetInRegs;
void setPreservesR0() { PreservesR0 = true; }
bool getPreservesR0() const { return PreservesR0; }
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index e2c9335db419..e750649ce86c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -19,8 +19,9 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
@@ -28,7 +29,6 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -352,7 +352,6 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<Instruction*, 8> Writes;
LoadPairs.clear();
WideLoads.clear();
- OrderedBasicBlock OrderedBB(BB);
// Collect loads and instruction that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
@@ -384,7 +383,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
ModRefInfo::ModRef)))
continue;
- if (OrderedBB.dominates(Write, Read))
+ if (Write->comesBefore(Read))
RAWDeps[Read].insert(Write);
}
}
@@ -392,8 +391,9 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// Check whether there's not a write between the two loads which would
// prevent them from being safely merged.
auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
- LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
- LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;
+ bool BaseFirst = Base->comesBefore(Offset);
+ LoadInst *Dominator = BaseFirst ? Base : Offset;
+ LoadInst *Dominated = BaseFirst ? Offset : Base;
if (RAWDeps.count(Dominated)) {
InstSet &WritesBefore = RAWDeps[Dominated];
@@ -401,7 +401,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
for (auto Before : WritesBefore) {
// We can't move the second load backward, past a write, to merge
// with the first load.
- if (OrderedBB.dominates(Dominator, Before))
+ if (Dominator->comesBefore(Before))
return false;
}
}
@@ -571,6 +571,10 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
+ // Check that each mul is operating on two different loads.
+ if (Ld0 == Ld2 || Ld1 == Ld3)
+ return false;
+
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
@@ -705,12 +709,11 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
}
// Roughly sort the mul pairs in their program order.
- OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
- llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
- const Instruction *A = PairA.first->Root;
- const Instruction *B = PairB.first->Root;
- return OrderedBB.dominates(A, B);
- });
+ llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
+ const Instruction *A = PairA.first->Root;
+ const Instruction *B = PairB.first->Root;
+ return A->comesBefore(B);
+ });
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
for (auto &Pair : R.getMulPairs()) {
@@ -772,8 +775,7 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
LoadTy->getPointerTo(AddrSpace));
- LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
- Base->getAlignment());
+ LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());
// Make sure everything is in the correct order in the basic block.
MoveBefore(Base->getPointerOperand(), VecPtr);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
index dea1d767beb4..1ae71be9f760 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td
@@ -7,148 +7,160 @@
//===----------------------------------------------------------------------===//
def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
- AssemblerPredicate<"HasV4TOps", "armv4t">;
+ AssemblerPredicate<(all_of HasV4TOps), "armv4t">;
def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
- AssemblerPredicate<"HasV5TOps", "armv5t">;
+ AssemblerPredicate<(all_of HasV5TOps), "armv5t">;
def NoV5T : Predicate<"!Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
- AssemblerPredicate<"HasV5TEOps", "armv5te">;
+ AssemblerPredicate<(all_of HasV5TEOps), "armv5te">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
- AssemblerPredicate<"HasV6Ops", "armv6">;
+ AssemblerPredicate<(all_of HasV6Ops), "armv6">;
def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
- AssemblerPredicate<"HasV6MOps",
+ AssemblerPredicate<(all_of HasV6MOps),
"armv6m or armv6t2">;
def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
- AssemblerPredicate<"HasV8MBaselineOps",
+ AssemblerPredicate<(all_of HasV8MBaselineOps),
"armv8m.base">;
def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
- AssemblerPredicate<"HasV8MMainlineOps",
+ AssemblerPredicate<(all_of HasV8MMainlineOps),
"armv8m.main">;
def HasV8_1MMainline : Predicate<"Subtarget->hasV8_1MMainlineOps()">,
- AssemblerPredicate<"HasV8_1MMainlineOps",
+ AssemblerPredicate<(all_of HasV8_1MMainlineOps),
"armv8.1m.main">;
def HasMVEInt : Predicate<"Subtarget->hasMVEIntegerOps()">,
- AssemblerPredicate<"HasMVEIntegerOps",
+ AssemblerPredicate<(all_of HasMVEIntegerOps),
"mve">;
def HasMVEFloat : Predicate<"Subtarget->hasMVEFloatOps()">,
- AssemblerPredicate<"HasMVEFloatOps",
+ AssemblerPredicate<(all_of HasMVEFloatOps),
"mve.fp">;
+def HasCDE : Predicate<"Subtarget->hasCDEOps()">,
+ AssemblerPredicate<(all_of HasCDEOps),
+ "cde">;
def HasFPRegs : Predicate<"Subtarget->hasFPRegs()">,
- AssemblerPredicate<"FeatureFPRegs",
+ AssemblerPredicate<(all_of FeatureFPRegs),
"fp registers">;
def HasFPRegs16 : Predicate<"Subtarget->hasFPRegs16()">,
- AssemblerPredicate<"FeatureFPRegs16",
+ AssemblerPredicate<(all_of FeatureFPRegs16),
+ "16-bit fp registers">;
+def HasNoFPRegs16 : Predicate<"!Subtarget->hasFPRegs16()">,
+ AssemblerPredicate<(all_of (not FeatureFPRegs16)),
"16-bit fp registers">;
def HasFPRegs64 : Predicate<"Subtarget->hasFPRegs64()">,
- AssemblerPredicate<"FeatureFPRegs64",
+ AssemblerPredicate<(all_of FeatureFPRegs64),
"64-bit fp registers">;
def HasFPRegsV8_1M : Predicate<"Subtarget->hasFPRegs() && Subtarget->hasV8_1MMainlineOps()">,
- AssemblerPredicate<"FeatureFPRegs,HasV8_1MMainlineOps",
+ AssemblerPredicate<(all_of FeatureFPRegs, HasV8_1MMainlineOps),
"armv8.1m.main with FP or MVE">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
- AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+ AssemblerPredicate<(all_of HasV6T2Ops), "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
def HasV6K : Predicate<"Subtarget->hasV6KOps()">,
- AssemblerPredicate<"HasV6KOps", "armv6k">;
+ AssemblerPredicate<(all_of HasV6KOps), "armv6k">;
def NoV6K : Predicate<"!Subtarget->hasV6KOps()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
- AssemblerPredicate<"HasV7Ops", "armv7">;
+ AssemblerPredicate<(all_of HasV7Ops), "armv7">;
def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
- AssemblerPredicate<"HasV8Ops", "armv8">;
+ AssemblerPredicate<(all_of HasV8Ops), "armv8">;
def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
- AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+ AssemblerPredicate<(all_of (not HasV8Ops)), "armv7 or earlier">;
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
- AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+ AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
- AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+ AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
- AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+ AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
- AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+ AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
- AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+ AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
+ AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
- AssemblerPredicate<"FeatureVFP2_SP", "VFP2">;
+ AssemblerPredicate<(all_of FeatureVFP2_SP), "VFP2">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">,
- AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
+ AssemblerPredicate<(all_of FeatureVFP3_D16_SP), "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">,
- AssemblerPredicate<"FeatureVFP4_D16_SP", "VFP4">;
+ AssemblerPredicate<(all_of FeatureVFP4_D16_SP), "VFP4">;
def HasDPVFP : Predicate<"Subtarget->hasFP64()">,
- AssemblerPredicate<"FeatureFP64",
+ AssemblerPredicate<(all_of FeatureFP64),
"double precision VFP">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8Base()">,
- AssemblerPredicate<"FeatureFPARMv8_D16_SP", "FPARMv8">;
+ AssemblerPredicate<(all_of FeatureFPARMv8_D16_SP), "FPARMv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<"FeatureNEON", "NEON">;
+ AssemblerPredicate<(all_of FeatureNEON), "NEON">;
def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
- AssemblerPredicate<"FeatureSHA2", "sha2">;
+ AssemblerPredicate<(all_of FeatureSHA2), "sha2">;
def HasAES : Predicate<"Subtarget->hasAES()">,
- AssemblerPredicate<"FeatureAES", "aes">;
+ AssemblerPredicate<(all_of FeatureAES), "aes">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<"FeatureCrypto", "crypto">;
+ AssemblerPredicate<(all_of FeatureCrypto), "crypto">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
- AssemblerPredicate<"FeatureDotProd", "dotprod">;
+ AssemblerPredicate<(all_of FeatureDotProd), "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<"FeatureCRC", "crc">;
+ AssemblerPredicate<(all_of FeatureCRC), "crc">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
- AssemblerPredicate<"FeatureRAS", "ras">;
+ AssemblerPredicate<(all_of FeatureRAS), "ras">;
def HasLOB : Predicate<"Subtarget->hasLOB()">,
- AssemblerPredicate<"FeatureLOB", "lob">;
+ AssemblerPredicate<(all_of FeatureLOB), "lob">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
- AssemblerPredicate<"FeatureFP16","half-float conversions">;
+ AssemblerPredicate<(all_of FeatureFP16),"half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
- AssemblerPredicate<"FeatureFullFP16","full half-float">;
+ AssemblerPredicate<(all_of FeatureFullFP16),"full half-float">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
- AssemblerPredicate<"FeatureFP16FML","full half-float fml">;
+ AssemblerPredicate<(all_of FeatureFP16FML),"full half-float fml">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">,
+ AssemblerPredicate<(all_of FeatureBF16),"BFloat16 floating point extension">;
+def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
+ AssemblerPredicate<(all_of FeatureMatMulInt8),"8-bit integer matrix multiply">;
def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
- AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
+ AssemblerPredicate<(all_of FeatureHWDivThumb), "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
- AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+ AssemblerPredicate<(all_of FeatureHWDivARM), "divide in ARM">;
def HasDSP : Predicate<"Subtarget->hasDSP()">,
- AssemblerPredicate<"FeatureDSP", "dsp">;
+ AssemblerPredicate<(all_of FeatureDSP), "dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
- AssemblerPredicate<"FeatureDB",
+ AssemblerPredicate<(all_of FeatureDB),
"data-barriers">;
def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">,
- AssemblerPredicate<"FeatureDFB",
+ AssemblerPredicate<(all_of FeatureDFB),
"full-data-barrier">;
def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
- AssemblerPredicate<"FeatureV7Clrex",
+ AssemblerPredicate<(all_of FeatureV7Clrex),
"v7 clrex">;
def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
- AssemblerPredicate<"FeatureAcquireRelease",
+ AssemblerPredicate<(all_of FeatureAcquireRelease),
"acquire/release">;
def HasMP : Predicate<"Subtarget->hasMPExtension()">,
- AssemblerPredicate<"FeatureMP",
+ AssemblerPredicate<(all_of FeatureMP),
"mp-extensions">;
def HasVirtualization: Predicate<"false">,
- AssemblerPredicate<"FeatureVirtualization",
+ AssemblerPredicate<(all_of FeatureVirtualization),
"virtualization-extensions">;
def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
- AssemblerPredicate<"FeatureTrustZone",
+ AssemblerPredicate<(all_of FeatureTrustZone),
"TrustZone">;
def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
- AssemblerPredicate<"Feature8MSecExt",
+ AssemblerPredicate<(all_of Feature8MSecExt),
"ARMv8-M Security Extensions">;
def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">,
- AssemblerPredicate<"ModeThumb", "thumb">;
+ AssemblerPredicate<(all_of ModeThumb), "thumb">;
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
- AssemblerPredicate<"ModeThumb,FeatureThumb2",
+ AssemblerPredicate<(all_of ModeThumb, FeatureThumb2),
"thumb2">;
def IsMClass : Predicate<"Subtarget->isMClass()">,
- AssemblerPredicate<"FeatureMClass", "armv*m">;
+ AssemblerPredicate<(all_of FeatureMClass), "armv*m">;
def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
- AssemblerPredicate<"!FeatureMClass",
+ AssemblerPredicate<(all_of (not FeatureMClass)),
"!armv*m">;
def IsARM : Predicate<"!Subtarget->isThumb()">,
- AssemblerPredicate<"!ModeThumb", "arm-mode">;
+ AssemblerPredicate<(all_of (not ModeThumb)), "arm-mode">;
def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -157,12 +169,12 @@ def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
def IsReadTPHard : Predicate<"Subtarget->isReadTPHard()">;
def IsReadTPSoft : Predicate<"!Subtarget->isReadTPHard()">;
def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
- AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+ AssemblerPredicate<(all_of FeatureNaClTrap), "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
def UseNegativeImmediates :
Predicate<"false">,
- AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
"NegativeImmediates">;
// FIXME: Eventually this will be just "hasV6T2Ops".
@@ -206,4 +218,4 @@ def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
// Armv8.5-A extensions
def HasSB : Predicate<"Subtarget->hasSB()">,
- AssemblerPredicate<"FeatureSB", "sb">;
+ AssemblerPredicate<(all_of FeatureSB), "sb">;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 43c8cd5a89be..f9dbfef4c113 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -131,45 +131,47 @@ static void checkValueMappings() {
ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
: ARMGenRegisterBankInfo() {
- static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
// done only once. Indeed the table of all register banks
// (ARM::RegBanks) is unique in the compiler. At some point, it
// will get tablegen'ed and the whole constructor becomes empty.
- if (AlreadyInit)
- return;
- AlreadyInit = true;
+ static llvm::once_flag InitializeRegisterBankFlag;
- const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
- (void)RBGPR;
- assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
+ static auto InitializeRegisterBankOnce = [&]() {
+ const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
+ (void)RBGPR;
+ assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
- // Initialize the GPR bank.
- assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+ // Initialize the GPR bank.
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(
+ *TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
#ifndef NDEBUG
- ARM::checkPartialMappings();
- ARM::checkValueMappings();
+ ARM::checkPartialMappings();
+ ARM::checkValueMappings();
#endif
+ };
+
+ llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
const RegisterBank &
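
The hunk above replaces a hand-rolled "static bool AlreadyInit" guard with llvm::call_once, so the one-time verification of the register bank table stays correct even if several subtargets construct this class concurrently. A minimal standalone sketch of the same pattern, using plain std::call_once rather than the LLVM wrapper (the class name and message below are illustrative, not from this commit):

#include <cstdio>
#include <mutex>

struct RegisterBankInfoSketch {
  RegisterBankInfoSketch() {
    // Mirrors InitializeRegisterBankFlag / InitializeRegisterBankOnce above:
    // the lambda body runs exactly once, no matter how many objects are
    // constructed or from how many threads.
    static std::once_flag InitFlag;
    std::call_once(InitFlag, [] { std::puts("checking register bank tables"); });
  }
};

int main() {
  RegisterBankInfoSketch A;
  RegisterBankInfoSketch B; // the second construction skips the init body
  return 0;
}
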
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 56055a15483a..a384b0dc757c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -305,6 +305,17 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
let DiagnosticType = "rGPR";
}
+// GPRs without the PC and SP but with APSR_NZCV. Some instructions allow
+// accessing the APSR_NZCV, while actually encoding PC in the register field.
+// This is useful for assembly and disassembly only.
+// Currently used by the CDE extension.
+def GPRwithAPSR_NZCVnosp
+ : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR_NZCV)> {
+ let isAllocatable = 0;
+ let DiagnosticString =
+ "operand must be a register in the range [r0, r12], r14 or apsr_nzcv";
+}
+
// Thumb registers are R0-R7 normally. Some instructions can still use
// the general GPR register class above (MOV, e.g.)
def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)> {
@@ -379,7 +390,7 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
-def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+def HPR : RegisterClass<"ARM", [f16, bf16], 32, (sequence "S%u", 0, 31)> {
let AltOrders = [(add (decimate HPR, 2), SPR),
(add (decimate HPR, 4),
(decimate HPR, 2),
@@ -401,7 +412,7 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
// class.
// ARM requires only word alignment for double. It's more performant if it
// is double-word alignment though.
-def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64,
(sequence "D%u", 0, 31)> {
// Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
// Darwin platforms.
@@ -422,20 +433,20 @@ def FPWithVPR : RegisterClass<"ARM", [f32], 32, (add SPR, DPR, VPR)> {
// Subset of DPR that are accessible with VFP2 (and so that also have
// 32-bit SPR subregs).
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64,
(trunc DPR, 16)> {
let DiagnosticString = "operand must be a register in range [d0, d15]";
}
// Subset of DPR which can be used as a source of NEON scalars for 16-bit
// operations
-def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64,
(trunc DPR, 8)> {
let DiagnosticString = "operand must be a register in range [d0, d7]";
}
// Generic 128-bit vector register class.
-def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, v8bf16], 128,
(sequence "Q%u", 0, 15)> {
// Allocate non-VFP2 aliases Q8-Q15 first.
let AltOrders = [(rotl QPR, 8), (trunc QPR, 8)];
@@ -577,3 +588,6 @@ def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
// Spaced quads of D registers.
def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>;
+
+// FP context payload
+def FPCXTRegs : RegisterClass<"ARM", [i32], 32, (add FPCXTNS)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td
index a79f3348f338..d9a8d304c41f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -96,7 +96,7 @@ def CortexA57Model : SchedMachineModel {
let FullInstRWOverlapCheck = 0;
let UnsupportedFeatures = [HasV8_1MMainline, HasMVEInt, HasMVEFloat,
- HasFPRegsV8_1M];
+ HasFPRegsV8_1M, HasFP16FML, HasMatMulInt8, HasBF16];
}
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td
index 00a44599b1b2..e0e98bfa0e9b 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -744,7 +744,7 @@ let SchedModel = SwiftModel in {
SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
- // Inaccurate: reuse describtion from 9 S registers.
+ // Inaccurate: reuse description from 9 S registers.
SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
@@ -760,7 +760,7 @@ let SchedModel = SwiftModel in {
SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
- // Inaccurate: reuse describtion from 9 S registers.
+ // Inaccurate: reuse description from 9 S registers.
SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
@@ -958,7 +958,7 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
(instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
- // Four element struture.
+ // Four element structure.
def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
(instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index cade06e8c109..7e06229b60c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -126,24 +126,24 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
- if ((Align & 3) != 0)
+ if (Alignment < Align(4))
return SDValue();
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMCPY);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMCPY);
uint64_t SizeVal = ConstantSize->getZExtValue();
if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMCPY);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMCPY);
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
@@ -240,16 +240,16 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMMOVE);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMMOVE);
}
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMSET);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMSET);
}
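
The memcpy/memmove/memset hooks above swap their raw "unsigned Align" parameter for LLVM's Align type. A small sketch of how the old and new checks correspond (this assumes the LLVM support headers are available; the helper names are made up for illustration):

#include "llvm/Support/Alignment.h"

// Old form, as on the removed line: reject the inline expansion when the
// raw byte alignment is not a multiple of 4.
static bool rejectInlineCopyOld(unsigned AlignBytes) {
  return (AlignBytes & 3) != 0;
}

// New form, as on the added line: Align always holds a power of two, so the
// same requirement becomes an ordinary comparison. Alignment.value() recovers
// the raw byte count where a callee (EmitSpecializedLibcall here) still takes
// an unsigned.
static bool rejectInlineCopyNew(llvm::Align Alignment) {
  return Alignment < llvm::Align(4);
}

int main() {
  // Both forms agree for an 8-byte-aligned copy.
  return rejectInlineCopyOld(8) == rejectInlineCopyNew(llvm::Align(8)) ? 0 : 1;
}
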
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index b8a86ae7310f..7aa831c09248 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -39,22 +39,22 @@ class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
SDValue
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
+ Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
// Adjust parameters for memset, see RTABI section 4.3.4
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool isVolatile,
+ SDValue Op3, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
index eb4d39b01cbb..46802037c2aa 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -183,7 +183,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (!ArchFS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
else
- ArchFS = FS;
+ ArchFS = std::string(FS);
}
ParseSubtargetFeatures(CPUString, ArchFS);
@@ -292,12 +292,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexA73:
case CortexA75:
case CortexA76:
+ case CortexA77:
+ case CortexA78:
case CortexR4:
case CortexR4F:
case CortexR5:
case CortexR7:
case CortexM3:
case CortexR52:
+ case CortexX1:
break;
case Exynos:
LdStMultipleTiming = SingleIssuePlusExtras;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
index 6bdd021970ef..2703e385dd81 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSchedule.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>
#include <string>
@@ -60,6 +61,8 @@ protected:
CortexA73,
CortexA75,
CortexA76,
+ CortexA77,
+ CortexA78,
CortexA8,
CortexA9,
CortexM3,
@@ -68,6 +71,7 @@ protected:
CortexR5,
CortexR52,
CortexR7,
+ CortexX1,
Exynos,
Krait,
Kryo,
@@ -108,6 +112,7 @@ protected:
ARMv83a,
ARMv84a,
ARMv85a,
+ ARMv86a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -157,11 +162,13 @@ protected:
bool HasV8_3aOps = false;
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
+ bool HasV8_6aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
bool HasV8_1MMainlineOps = false;
bool HasMVEIntegerOps = false;
bool HasMVEFloatOps = false;
+ bool HasCDEOps = false;
/// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
@@ -254,6 +261,12 @@ protected:
/// HasFP16FML - True if subtarget supports half-precision FP fml operations
bool HasFP16FML = false;
+ /// HasBF16 - True if subtarget supports BFloat16 floating point operations
+ bool HasBF16 = false;
+
+ /// HasMatMulInt8 - True if subtarget supports 8-bit integer matrix multiply
+ bool HasMatMulInt8 = false;
+
/// HasD32 - True if subtarget has the full 32 double precision
/// FP registers for VFPv3.
bool HasD32 = false;
@@ -562,6 +575,7 @@ private:
void initSubtargetFeatures(StringRef CPU, StringRef FS);
ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS);
+ std::bitset<8> CoprocCDE = {};
public:
void computeIssueWidth();
@@ -579,11 +593,13 @@ public:
bool hasV8_3aOps() const { return HasV8_3aOps; }
bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8_5aOps() const { return HasV8_5aOps; }
+ bool hasV8_6aOps() const { return HasV8_6aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
bool hasMVEIntegerOps() const { return HasMVEIntegerOps; }
bool hasMVEFloatOps() const { return HasMVEFloatOps; }
+ bool hasCDEOps() const { return HasCDEOps; }
bool hasFPRegs() const { return HasFPRegs; }
bool hasFPRegs16() const { return HasFPRegs16; }
bool hasFPRegs64() const { return HasFPRegs64; }
@@ -689,12 +705,15 @@ public:
bool hasD32() const { return HasD32; }
bool hasFullFP16() const { return HasFullFP16; }
bool hasFP16FML() const { return HasFP16FML; }
+ bool hasBF16() const { return HasBF16; }
bool hasFuseAES() const { return HasFuseAES; }
bool hasFuseLiterals() const { return HasFuseLiterals; }
/// Return true if the CPU supports any kind of instruction fusion.
bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); }
+ bool hasMatMulInt8() const { return HasMatMulInt8; }
+
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 84876eda33a6..9ead5fa4308c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -96,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
initializeMVEVPTBlockPass(Registry);
+ initializeMVEVPTOptimisationsPass(Registry);
initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
initializeMVEGatherScatterLoweringPass(Registry);
@@ -243,7 +244,14 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
this->Options.NoTrapAfterNoreturn = true;
}
+ // ARM supports the debug entry values.
+ setSupportsDebugEntryValues(true);
+
initAsmInfo();
+
+ // ARM supports the MachineOutliner.
+ setMachineOutliner(true);
+ setSupportsDefaultOutlining(false);
}
ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
@@ -359,6 +367,7 @@ public:
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+ void addPreEmitPass2() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -483,6 +492,8 @@ bool ARMPassConfig::addGlobalInstructionSelect() {
void ARMPassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createMVEVPTOptimisationsPass());
+
addPass(createMLxExpansionPass());
if (EnableARMLoadStoreOpt)
@@ -507,9 +518,12 @@ void ARMPassConfig::addPreSched2() {
addPass(createARMExpandPseudoPass());
if (getOptLevel() != CodeGenOpt::None) {
- // in v8, IfConversion depends on Thumb instruction widths
+ // When optimising for size, always run the Thumb2SizeReduction pass before
+ // IfConversion. Otherwise, check whether IT blocks are restricted
+ // (e.g. in v8, IfConversion depends on Thumb instruction widths)
addPass(createThumb2SizeReductionPass([this](const Function &F) {
- return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
+ return this->TM->getSubtarget<ARMSubtarget>(F).hasMinSize() ||
+ this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
}));
addPass(createIfConverter([](const MachineFunction &MF) {
@@ -538,7 +552,9 @@ void ARMPassConfig::addPreEmitPass() {
// Don't optimize barriers at -O0.
if (getOptLevel() != CodeGenOpt::None)
addPass(createARMOptimizeBarriersPass());
+}
+void ARMPassConfig::addPreEmitPass2() {
addPass(createARMConstantIslandPass());
addPass(createARMLowOverheadLoopsPass());
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 891329d3f297..3f0e3360632d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -49,7 +49,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
// Since we cannot modify flags for an existing section, we create a new
// section with the right flags, and use 0 as the unique ID for
// execute-only text
- TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
+ TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U, nullptr);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 7ff05034c1f2..bea4e157a131 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -16,18 +16,19 @@
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -45,7 +46,7 @@ static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
-extern cl::opt<bool> DisableTailPredication;
+extern cl::opt<TailPredication::Mode> EnableTailPredication;
extern cl::opt<bool> EnableMaskedGatherScatters;
@@ -57,17 +58,32 @@ bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
- // To inline a callee, all features not in the whitelist must match exactly.
- bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
- (CalleeBits & ~InlineFeatureWhitelist);
- // For features in the whitelist, the callee's features must be a subset of
+ // To inline a callee, all features not in the allowed list must match exactly.
+ bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
+ (CalleeBits & ~InlineFeaturesAllowed);
+ // For features in the allowed list, the callee's features must be a subset of
// the callers'.
- bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
- (CalleeBits & InlineFeatureWhitelist);
+ bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
+ (CalleeBits & InlineFeaturesAllowed);
return MatchExact && MatchSubset;
}
-int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
+ if (L->getHeader()->getParent()->hasOptSize())
+ return false;
+ if (ST->hasMVEIntegerOps())
+ return false;
+ return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
+}
+
+bool ARMTTIImpl::shouldFavorPostInc() const {
+ if (ST->hasMVEIntegerOps())
+ return true;
+ return false;
+}
+
+int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned Bits = Ty->getPrimitiveSizeInBits();
@@ -110,7 +126,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
}
int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+ Type *Ty, TTI::TargetCostKind CostKind) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
// not), but that the alternative is worse.
@@ -125,12 +141,14 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Imm == 255 || Imm == 65535)
return 0;
// Conversion to BIC is free, and means we can use ~Imm instead.
- return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+ return std::min(getIntImmCost(Imm, Ty, CostKind),
+ getIntImmCost(~Imm, Ty, CostKind));
}
if (Opcode == Instruction::Add)
// Conversion to SUB is free, and means we can use -Imm instead.
- return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
+ return std::min(getIntImmCost(Imm, Ty, CostKind),
+ getIntImmCost(-Imm, Ty, CostKind));
if (Opcode == Instruction::ICmp && Imm.isNegative() &&
Ty->getIntegerBitWidth() == 32) {
@@ -147,34 +165,27 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
return 0;
- return getIntImmCost(Imm, Ty);
+ return getIntImmCost(Imm, Ty, CostKind);
}
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- // Single to/from double precision conversions.
- static const CostTblEntry NEONFltDblTbl[] = {
- // Vector fptrunc/fpext conversions.
- { ISD::FP_ROUND, MVT::v2f64, 2 },
- { ISD::FP_EXTEND, MVT::v2f32, 2 },
- { ISD::FP_EXTEND, MVT::v4f32, 4 }
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
};
- if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
- ISD == ISD::FP_EXTEND)) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
- return LT.first * Entry->Cost;
- }
-
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
// The extend of a load is free
if (I && isa<LoadInst>(I->getOperand(0))) {
@@ -194,7 +205,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
};
if (const auto *Entry = ConvertCostTableLookup(
LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
@@ -203,27 +214,129 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
{ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
{ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
+ // The following extend from a legal type to an illegal type, so need to
+ // split the load. This introduced an extra load operation, but the
+ // extend is still "free".
+ {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
+ {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
+ {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
+ {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
};
if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
if (const auto *Entry =
ConvertCostTableLookup(MVELoadConversionTbl, ISD,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+
+ static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ // FPExtends are similar but also require the VCVT instructions.
+ {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
+ {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
+ };
+ if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+ }
+
+ // The truncate of a store is free. This is the mirror of extends above.
+ if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) {
+ static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
+ {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
+ {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
+ {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
+ {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
+ {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
+ };
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVELoadConversionTbl, ISD, SrcTy.getSimpleVT(),
+ DstTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+
+ static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
+ {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
+ };
+ if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(),
+ DstTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}
+ // NEON vector operations that can extend their inputs.
+ if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
+ I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
+ static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
+ // vaddl
+ { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
+ // vsubl
+ { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
+ // vmull
+ { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
+ // vshll
+ { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
+ };
+
+ auto *User = cast<Instruction>(*I->user_begin());
+ int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
+ if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT())) {
+ return AdjustCost(Entry->Cost);
+ }
+ }
+
+ // Single to/from double precision conversions.
+ if (Src->isVectorTy() && ST->hasNEON() &&
+ ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
+ DstTy.getScalarType() == MVT::f32) ||
+ (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
+ DstTy.getScalarType() == MVT::f64))) {
+ static const CostTblEntry NEONFltDblTbl[] = {
+ // Vector fptrunc/fpext conversions.
+ {ISD::FP_ROUND, MVT::v2f64, 2},
+ {ISD::FP_EXTEND, MVT::v2f32, 2},
+ {ISD::FP_EXTEND, MVT::v4f32, 4}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
+ return AdjustCost(LT.first * Entry->Cost);
+ }
+
// Some arithmetic, load and store operations have specific instructions
// to cast up/down their types automatically at no extra cost.
// TODO: Get these tables to know at least what the related operations are.
static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
// The number of vmovl instructions for the extension.
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
@@ -294,7 +407,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
// Scalar float to integer conversions.
@@ -324,7 +437,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
// Scalar integer to float conversions.
@@ -355,7 +468,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
// MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
@@ -380,7 +493,28 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost * ST->getMVEVectorCostFactor();
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+
+ if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
+ // As general rule, fp converts that were not matched above are scalarized
+    // As a general rule, fp converts that were not matched above are scalarized
+ // If not it will become a series of function calls.
+ const int CallCost = getCallInstrCost(nullptr, Dst, {Src}, CostKind);
+ int Lanes = 1;
+ if (SrcTy.isFixedLengthVector())
+ Lanes = SrcTy.getVectorNumElements();
+ auto IsLegal = [this](EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+ (EltVT == MVT::f64 && ST->hasFP64()) ||
+ (EltVT == MVT::f16 && ST->hasFullFP16());
+ };
+
+ if (IsLegal(SrcTy) && IsLegal(DstTy))
+ return Lanes;
+ else
+ return Lanes * CallCost;
}
// Scalar integer conversion costs.
@@ -399,13 +533,14 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
- return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(
+ BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -420,7 +555,7 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
Opcode == Instruction::ExtractElement)) {
// Cross-class copies are expensive on many microarchitectures,
// so assume they are expensive by default.
- if (ValTy->getVectorElementType()->isIntegerTy())
+ if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
return 3;
// Even if it's not a cross class copy, this likely leads to mixing
@@ -438,14 +573,19 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
// result anyway.
return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
ST->getMVEVectorCostFactor()) *
- ValTy->getVectorNumElements() / 2;
+ cast<FixedVectorType>(ValTy)->getNumElements() / 2;
}
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
@@ -472,7 +612,8 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
- return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind,
+ I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -496,11 +637,28 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // If a VCTP is part of a chain, it's already profitable and shouldn't be
+ // optimized, else LSR may block tail-predication.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::arm_mve_vctp8:
+ case Intrinsic::arm_mve_vctp16:
+ case Intrinsic::arm_mve_vctp32:
+ case Intrinsic::arm_mve_vctp64:
+ return true;
+ default:
+ break;
+ }
+ }
+ return false;
+}
+
+bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
return false;
- if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+ if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
// Don't support v2i1 yet.
if (VecTy->getNumElements() == 2)
return false;
@@ -512,12 +670,11 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
}
unsigned EltWidth = DataTy->getScalarSizeInBits();
- return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
- (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
- (EltWidth == 8);
+ return (EltWidth == 32 && Alignment >= 4) ||
+ (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}
-bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
+bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
return false;
@@ -534,8 +691,8 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
return false;
unsigned EltWidth = Ty->getScalarSizeInBits();
- return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
- (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
+ return ((EltWidth == 32 && Alignment >= 4) ||
+ (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
@@ -552,8 +709,8 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
return LibCallCost;
const unsigned Size = C->getValue().getZExtValue();
- const unsigned DstAlign = MI->getDestAlignment();
- const unsigned SrcAlign = MI->getSourceAlignment();
+ const Align DstAlign = *MI->getDestAlign();
+ const Align SrcAlign = *MI->getSourceAlign();
const Function *F = I->getParent()->getParent();
const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
std::vector<EVT> MemOps;
@@ -562,8 +719,9 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
// loaded and stored. That's why we multiply the number of elements by 2 to
// get the cost for this memcpy.
if (getTLI()->findOptimalMemOpLowering(
- MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
- false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
+ MemOps, Limit,
+ MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+ /*IsVolatile*/ true),
MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
F->getAttributes()))
return MemOps.size() * 2;
@@ -572,8 +730,8 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
return LibCallCost;
}
-int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ int Index, VectorType *SubTp) {
if (ST->hasNEON()) {
if (Kind == TTI::SK_Broadcast) {
static const CostTblEntry NEONDupTbl[] = {
@@ -667,12 +825,19 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -723,7 +888,8 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info,
Opd1PropInfo, Opd2PropInfo);
// This is somewhat of a hack. The problem that we are facing is that SROA
@@ -779,12 +945,13 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * BaseCost;
// Else this is expand, assume that we need to scalarize this op.
- if (Ty->isVectorTy()) {
- unsigned Num = Ty->getVectorNumElements();
- unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
+ unsigned Num = VTy->getNumElements();
+ unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
+ CostKind);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
+ return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
}
return BaseCost;
@@ -792,26 +959,53 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
if (ST->hasNEON() && Src->isVectorTy() &&
(Alignment && *Alignment != Align(16)) &&
- Src->getVectorElementType()->isDoubleTy()) {
+ cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
return LT.first * 4;
}
+
+ // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
+ // Same for stores.
+ if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
+ ((Opcode == Instruction::Load && I->hasOneUse() &&
+ isa<FPExtInst>(*I->user_begin())) ||
+ (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
+ FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
+ Type *DstTy =
+ Opcode == Instruction::Load
+ ? (*I->user_begin())->getType()
+ : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
+ if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
+ DstTy->getScalarType()->isFloatTy())
+ return ST->getMVEVectorCostFactor();
+ }
+
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
- return BaseCost * LT.first;
+ return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind, I);
}
int ARMTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
- bool UseMaskForGaps) {
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@@ -820,8 +1014,9 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
!UseMaskForCond && !UseMaskForGaps) {
- unsigned NumElts = VecTy->getVectorNumElements();
- auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+ unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
+ auto *SubVecTy =
+ FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -842,10 +1037,109 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
+unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ using namespace PatternMatch;
+ if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+
+ assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
+ auto *VTy = cast<FixedVectorType>(DataTy);
+
+ // TODO: Splitting, once we do that.
+
+ unsigned NumElems = VTy->getNumElements();
+ unsigned EltSize = VTy->getScalarSizeInBits();
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
+
+ // For now, it is assumed that for the MVE gather instructions the loads are
+ // all effectively serialised. This means the cost is the scalar cost
+ // multiplied by the number of elements being loaded. This is possibly very
+ // conservative, but even so we still end up vectorising loops because the
+ // cost per iteration for many loops is lower than for scalar loops.
+ unsigned VectorCost = NumElems * LT.first;
+ // The scalarization cost should be a lot higher. We use the number of vector
+ // elements plus the scalarization overhead.
+ unsigned ScalarCost =
+ NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
+
+ if (Alignment < EltSize / 8)
+ return ScalarCost;
+
+ unsigned ExtSize = EltSize;
+ // Check whether there's a single user that asks for an extended type
+ if (I != nullptr) {
+    // Depending on the caller of this function, a gather instruction will
+ // either have opcode Instruction::Load or be a call to the masked_gather
+ // intrinsic
+ if ((I->getOpcode() == Instruction::Load ||
+ match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
+ I->hasOneUse()) {
+ const User *Us = *I->users().begin();
+ if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
+ // only allow valid type combinations
+ unsigned TypeSize =
+ cast<Instruction>(Us)->getType()->getScalarSizeInBits();
+ if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
+ (TypeSize == 16 && EltSize == 8)) &&
+ TypeSize * NumElems == 128) {
+ ExtSize = TypeSize;
+ }
+ }
+ }
+ // Check whether the input data needs to be truncated
+ TruncInst *T;
+ if ((I->getOpcode() == Instruction::Store ||
+ match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
+ (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
+ // Only allow valid type combinations
+ unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
+ if (((EltSize == 16 && TypeSize == 32) ||
+ (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
+ TypeSize * NumElems == 128)
+ ExtSize = TypeSize;
+ }
+ }
+
+ if (ExtSize * NumElems != 128 || NumElems < 4)
+ return ScalarCost;
+
+ // Any (aligned) i32 gather will not need to be scalarised.
+ if (ExtSize == 32)
+ return VectorCost;
+ // For smaller types, we need to ensure that the gep's inputs are correctly
+ // extended from a small enough value. Other sizes (including i64) are
+ // scalarized for now.
+ if (ExtSize != 8 && ExtSize != 16)
+ return ScalarCost;
+
+ if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
+ Ptr = BC->getOperand(0);
+ if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (GEP->getNumOperands() != 2)
+ return ScalarCost;
+ unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
+ // Scale needs to be correct (which is only relevant for i16s).
+ if (Scale != 1 && Scale * 8 != ExtSize)
+ return ScalarCost;
+ // And we need to zext (not sext) the indexes from a small enough type.
+ if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
+ if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
+ return VectorCost;
+ }
+ return ScalarCost;
+ }
+ return ScalarCost;
+}
+
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
if (!F->isIntrinsic())
BaseT::isLoweredToCall(F);
@@ -913,23 +1207,31 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
HardwareLoopInfo &HWLoopInfo) {
// Low-overhead branches are only supported in the 'low-overhead branch'
// extension of v8.1-m.
- if (!ST->hasLOB() || DisableLowOverheadLoops)
+ if (!ST->hasLOB() || DisableLowOverheadLoops) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
return false;
+ }
- if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+ if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
return false;
+ }
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
return false;
+ }
const SCEV *TripCountSCEV =
SE.getAddExpr(BackedgeTakenCount,
SE.getOne(BackedgeTakenCount->getType()));
// We need to store the trip count in LR, a 32-bit register.
- if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+ if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
return false;
+ }
// Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
// point in generating a hardware loop if that's going to happen.
@@ -1034,8 +1336,10 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
auto ScanLoop = [&](Loop *L) {
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
+ if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
return false;
+ }
}
}
return true;
@@ -1102,12 +1406,47 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const DataLayout &DL,
const LoopAccessInfo *LAI) {
+ LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
+
+ // If there are live-out values, it is probably a reduction, which needs a
+ // final reduction step after the loop. MVE has a VADDV instruction to reduce
+ // integer vectors, but doesn't have an equivalent one for float vectors. A
+ // live-out value that is not recognised as a reduction will result in the
+ // tail-predicated loop to be reverted to a non-predicated loop and this is
+ // very expensive, i.e. it has a significant performance impact. So, in this
+ // case it's better not to tail-predicate the loop, which is what we check
+ // here. Thus, we allow only 1 live-out value, which has to be an integer
+ // reduction, which matches the loops supported by ARMLowOverheadLoops.
+ // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in
+ // sync with each other.
+ SmallVector< Instruction *, 8 > LiveOuts;
+ LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
+ bool IntReductionsDisabled =
+ EnableTailPredication == TailPredication::EnabledNoReductions ||
+ EnableTailPredication == TailPredication::ForceEnabledNoReductions;
+
+ for (auto *I : LiveOuts) {
+ if (!I->getType()->isIntegerTy()) {
+ LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer "
+ "live-out value\n");
+ return false;
+ }
+ if (I->getOpcode() != Instruction::Add) {
+ LLVM_DEBUG(dbgs() << "Only add reductions supported\n");
+ return false;
+ }
+ if (IntReductionsDisabled) {
+ LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n");
+ return false;
+ }
+ }
+
+ // Next, check that all instructions can be tail-predicated.
PredicatedScalarEvolution PSE = LAI->getPSE();
+ SmallVector<Instruction *, 16> LoadStores;
int ICmpCount = 0;
int Stride = 0;
- LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
- SmallVector<Instruction *, 16> LoadStores;
for (BasicBlock *BB : L->blocks()) {
for (Instruction &I : BB->instructionsWithoutDebug()) {
if (isa<PHINode>(&I))
@@ -1155,8 +1494,10 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
TargetLibraryInfo *TLI,
DominatorTree *DT,
const LoopAccessInfo *LAI) {
- if (DisableTailPredication)
+ if (!EnableTailPredication) {
+ LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
return false;
+ }
// Creating a predicated vector loop is the first step for generating a
// tail-predicated hardware loop, for which we need the MVE masked
@@ -1197,7 +1538,16 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return canTailPredicateLoop(L, LI, SE, DL, LAI);
}
+bool ARMTTIImpl::emitGetActiveLaneMask() const {
+ if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
+ return false;
+ // Intrinsic @llvm.get.active.lane.mask is supported.
+ // It is used in the MVETailPredication pass, which requires the number of
+ // elements processed by this vector loop to setup the tail-predicated
+ // loop.
+ return true;
+}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
@@ -1241,8 +1591,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
return;
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- ImmutableCallSite CS(&I);
- if (const Function *F = CS.getCalledFunction()) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (!isLoweredToCall(F))
continue;
}
@@ -1251,7 +1600,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
SmallVector<const Value*, 4> Operands(I.value_op_begin(),
I.value_op_end());
- Cost += getUserCost(&I, Operands);
+ Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize);
}
}
@@ -1271,27 +1620,12 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Force = true;
}
+void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
+
bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
- assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
- unsigned ScalarBits = Ty->getScalarSizeInBits();
- if (!ST->hasMVEIntegerOps())
- return false;
-
- switch (Opcode) {
- case Instruction::FAdd:
- case Instruction::FMul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Mul:
- case Instruction::FCmp:
- return false;
- case Instruction::ICmp:
- case Instruction::Add:
- return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
- default:
- llvm_unreachable("Unhandled reduction opcode");
- }
- return false;
+ return ST->hasMVEIntegerOps();
}
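
As a concrete instance of the gather/scatter cost heuristic added above (the arithmetic follows the code; the scalarization overhead is an assumed value, since the real number comes from BaseT::getScalarizationOverhead):

#include <cstdio>

int main() {
  // A masked gather of <4 x i32> with 4-byte alignment on MVE.
  const unsigned NumElems = 4;              // vector elements
  const unsigned LTFirst = 1;               // <4 x i32> legalizes to one 128-bit vector
  const unsigned ScalarizationOverhead = 8; // assumed for the example
  const unsigned EltSize = 32;              // bits per element
  const unsigned ExtSize = EltSize;         // no extending user in this example

  // "Serialised loads" model: one load per element.
  unsigned VectorCost = NumElems * LTFirst;                          // 4
  unsigned ScalarCost = NumElems * LTFirst + ScalarizationOverhead;  // 12

  // Alignment (4 bytes) >= EltSize / 8, ExtSize * NumElems == 128 and
  // ExtSize == 32, so the code above returns VectorCost; narrower or
  // misaligned element types fall back to ScalarCost.
  std::printf("vector=%u scalar=%u (eltsize=%u extsize=%u)\n", VectorCost,
              ScalarCost, EltSize, ExtSize);
  return 0;
}
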
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index f66083eaf187..7bf6de4bffe0 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -38,6 +38,16 @@ class ScalarEvolution;
class Type;
class Value;
+namespace TailPredication {
+ enum Mode {
+ Disabled = 0,
+ EnabledNoReductions,
+ Enabled,
+ ForceEnabledNoReductions,
+ ForceEnabled
+ };
+}
+
class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT = BasicTTIImplBase<ARMTTIImpl>;
using TTI = TargetTransformInfo;
@@ -47,13 +57,13 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const ARMSubtarget *ST;
const ARMTargetLowering *TLI;
- // Currently the following features are excluded from InlineFeatureWhitelist.
+ // Currently the following features are excluded from InlineFeaturesAllowed.
// ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32
// Depending on whether they are set or unset, different
// instructions/registers are available. For example, inlining a callee with
// -thumb-mode in a caller with +thumb-mode, may cause the assembler to
// fail if the callee uses ARM only instructions, e.g. in inline asm.
- const FeatureBitset InlineFeatureWhitelist = {
+ const FeatureBitset InlineFeaturesAllowed = {
ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2,
ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8,
ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb,
@@ -93,11 +103,8 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
- bool shouldFavorBackedgeIndex(const Loop *L) const {
- if (L->getHeader()->getParent()->hasOptSize())
- return false;
- return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
- }
+ bool shouldFavorBackedgeIndex(const Loop *L) const;
+ bool shouldFavorPostInc() const;
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
@@ -113,9 +120,10 @@ public:
Type *Ty);
using BaseT::getIntImmCost;
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind);
/// @}
@@ -153,19 +161,24 @@ public:
return ST->getMaxInterleaveFactor();
}
- bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
+ bool isProfitableLSRChainElement(Instruction *I);
- bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {
+ bool isLegalMaskedLoad(Type *DataTy, Align Alignment);
+
+ bool isLegalMaskedStore(Type *DataTy, Align Alignment) {
return isLegalMaskedLoad(DataTy, Alignment);
}
- bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment);
+ bool isLegalMaskedGather(Type *Ty, Align Alignment);
- bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; }
+ bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
+ return isLegalMaskedGather(Ty, Alignment);
+ }
int getMemcpyCost(const Instruction *I);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
@@ -194,9 +207,11 @@ public:
}
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -206,6 +221,7 @@ public:
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -214,13 +230,20 @@ public:
const Instruction *CxtI = nullptr);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
bool isLoweredToCall(const Function *F);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
@@ -236,6 +259,10 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ bool emitGetActiveLaneMask() const;
+
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
bool shouldBuildLookupTablesForConstant(Constant *C) const {
// In the ROPI and RWPI relocation models we can't have pointers to global
// variables or functions in constant data, so don't convert switches to
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f6d76ee09534..05f870b90ecd 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
@@ -180,10 +181,68 @@ public:
}
};
+// Various sets of ARM instruction mnemonics which are used by the asm parser
+class ARMMnemonicSets {
+ StringSet<> CDE;
+ StringSet<> CDEWithVPTSuffix;
+public:
+ ARMMnemonicSets(const MCSubtargetInfo &STI);
+
+ /// Returns true iff a given mnemonic is a CDE instruction
+ bool isCDEInstr(StringRef Mnemonic) {
+ // Quick check before searching the set
+ if (!Mnemonic.startswith("cx") && !Mnemonic.startswith("vcx"))
+ return false;
+ return CDE.count(Mnemonic);
+ }
+
+ /// Returns true iff a given mnemonic is a VPT-predicable CDE instruction
+ /// (possibly with a predication suffix "e" or "t")
+ bool isVPTPredicableCDEInstr(StringRef Mnemonic) {
+ if (!Mnemonic.startswith("vcx"))
+ return false;
+ return CDEWithVPTSuffix.count(Mnemonic);
+ }
+
+ /// Returns true iff a given mnemonic is an IT-predicable CDE instruction
+ /// (possibly with a condition suffix)
+ bool isITPredicableCDEInstr(StringRef Mnemonic) {
+ if (!Mnemonic.startswith("cx"))
+ return false;
+ return Mnemonic.startswith("cx1a") || Mnemonic.startswith("cx1da") ||
+ Mnemonic.startswith("cx2a") || Mnemonic.startswith("cx2da") ||
+ Mnemonic.startswith("cx3a") || Mnemonic.startswith("cx3da");
+ }
+
+ /// Returns true iff a given mnemonic is an integer CDE instruction with
+ /// dual-register destination
+ bool isCDEDualRegInstr(StringRef Mnemonic) {
+ if (!Mnemonic.startswith("cx"))
+ return false;
+ return Mnemonic == "cx1d" || Mnemonic == "cx1da" ||
+ Mnemonic == "cx2d" || Mnemonic == "cx2da" ||
+ Mnemonic == "cx3d" || Mnemonic == "cx3da";
+ }
+};
+
+ARMMnemonicSets::ARMMnemonicSets(const MCSubtargetInfo &STI) {
+ for (StringRef Mnemonic: { "cx1", "cx1a", "cx1d", "cx1da",
+ "cx2", "cx2a", "cx2d", "cx2da",
+ "cx3", "cx3a", "cx3d", "cx3da", })
+ CDE.insert(Mnemonic);
+ for (StringRef Mnemonic :
+ {"vcx1", "vcx1a", "vcx2", "vcx2a", "vcx3", "vcx3a"}) {
+ CDE.insert(Mnemonic);
+ CDEWithVPTSuffix.insert(Mnemonic);
+ CDEWithVPTSuffix.insert(std::string(Mnemonic) + "t");
+ CDEWithVPTSuffix.insert(std::string(Mnemonic) + "e");
+ }
+}
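
The ARMMnemonicSets constructor above seeds two sets: every CDE mnemonic (cx1..cx3da and vcx1..vcx3a) and the VPT-predicable vcx* forms together with their "t"/"e" suffixed spellings. A minimal standalone sketch of that classification using std::set in place of llvm::StringSet (the MCSubtargetInfo parameter is not used by the seeding shown in this hunk):

#include <iostream>
#include <set>
#include <string>

int main() {
  std::set<std::string> CDE, CDEWithVPTSuffix;
  for (const char *M : {"cx1", "cx1a", "cx1d", "cx1da", "cx2", "cx2a", "cx2d",
                        "cx2da", "cx3", "cx3a", "cx3d", "cx3da"})
    CDE.insert(M);
  for (const char *M : {"vcx1", "vcx1a", "vcx2", "vcx2a", "vcx3", "vcx3a"}) {
    CDE.insert(M);
    CDEWithVPTSuffix.insert(M);
    CDEWithVPTSuffix.insert(std::string(M) + "t"); // VPT "then" suffix
    CDEWithVPTSuffix.insert(std::string(M) + "e"); // VPT "else" suffix
  }

  for (std::string M : {"cx2da", "vcx3at", "vadd"})
    std::cout << M << ": CDE=" << CDE.count(M)
              << " VPT-predicable=" << CDEWithVPTSuffix.count(M) << "\n";
  return 0;
}
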
class ARMAsmParser : public MCTargetAsmParser {
const MCRegisterInfo *MRI;
UnwindContext UC;
+ ARMMnemonicSets MS;
ARMTargetStreamer &getTargetStreamer() {
assert(getParser().getStreamer().getTargetStreamer() &&
@@ -245,12 +304,12 @@ class ARMAsmParser : public MCTargetAsmParser {
ITInst.setOpcode(ARM::t2IT);
ITInst.addOperand(MCOperand::createImm(ITState.Cond));
ITInst.addOperand(MCOperand::createImm(ITState.Mask));
- Out.EmitInstruction(ITInst, getSTI());
+ Out.emitInstruction(ITInst, getSTI());
// Emit the conditional instructions
assert(PendingConditionalInsts.size() <= 4);
for (const MCInst &Inst : PendingConditionalInsts) {
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
}
PendingConditionalInsts.clear();
@@ -444,6 +503,8 @@ class ARMAsmParser : public MCTargetAsmParser {
void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
OperandVector &Operands);
+ bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands);
+
bool isThumb() const {
// FIXME: Can tablegen auto-generate this?
return getSTI().getFeatureBits()[ARM::ModeThumb];
@@ -501,6 +562,9 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasMVEFloat() const {
return getSTI().getFeatureBits()[ARM::HasMVEFloatOps];
}
+ bool hasCDE() const {
+ return getSTI().getFeatureBits()[ARM::HasCDEOps];
+ }
bool has8MSecExt() const {
return getSTI().getFeatureBits()[ARM::Feature8MSecExt];
}
@@ -605,7 +669,7 @@ public:
ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI, MII), UC(Parser) {
+ : MCTargetAsmParser(Options, STI, MII), UC(Parser), MS(STI) {
MCAsmParserExtension::Initialize(Parser);
// Cache the MCRegisterInfo.
@@ -628,6 +692,8 @@ public:
// Implementation of the MCTargetAsmParser interface:
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
@@ -3553,8 +3619,7 @@ public:
if (Kind == k_RegisterList && Regs.back().second == ARM::APSR)
Kind = k_RegisterListWithAPSR;
- assert(std::is_sorted(Regs.begin(), Regs.end()) &&
- "Register list must be sorted by encoding");
+ assert(llvm::is_sorted(Regs) && "Register list must be sorted by encoding");
auto Op = std::make_unique<ARMOperand>(Kind);
for (const auto &P : Regs)
@@ -3885,6 +3950,14 @@ bool ARMAsmParser::ParseRegister(unsigned &RegNo,
return (RegNo == (unsigned)-1);
}
+OperandMatchResultTy ARMAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ if (ParseRegister(RegNo, StartLoc, EndLoc))
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
/// Try to parse a register name. The token must be an Identifier when called,
/// and if it is a register name the token is eaten and the register number is
/// returned. Otherwise return -1.
@@ -6045,20 +6118,35 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
case AsmToken::LCurly:
return parseRegisterList(Operands, !Mnemonic.startswith("clr"));
case AsmToken::Dollar:
- case AsmToken::Hash:
- // #42 -> immediate.
+ case AsmToken::Hash: {
+ // #42 -> immediate
+ // $ 42 -> immediate
+ // $foo -> symbol name
+ // $42 -> symbol name
S = Parser.getTok().getLoc();
- Parser.Lex();
+
+ // Favor the interpretation of $-prefixed operands as symbol names.
+ // Cases where immediates are explicitly expected are handled by their
+ // specific ParseMethod implementations.
+ auto AdjacentToken = getLexer().peekTok(/*ShouldSkipSpace=*/false);
+ bool ExpectIdentifier = Parser.getTok().is(AsmToken::Dollar) &&
+ (AdjacentToken.is(AsmToken::Identifier) ||
+ AdjacentToken.is(AsmToken::Integer));
+ if (!ExpectIdentifier) {
+ // The token is not part of an identifier. Drop the leading $ or # before
+ // parsing the expression.
+ Parser.Lex();
+ }
if (Parser.getTok().isNot(AsmToken::Colon)) {
- bool isNegative = Parser.getTok().is(AsmToken::Minus);
+ bool IsNegative = Parser.getTok().is(AsmToken::Minus);
const MCExpr *ImmVal;
if (getParser().parseExpression(ImmVal))
return true;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
if (CE) {
int32_t Val = CE->getValue();
- if (isNegative && Val == 0)
+ if (IsNegative && Val == 0)
ImmVal = MCConstantExpr::create(std::numeric_limits<int32_t>::min(),
getContext());
}
@@ -6077,7 +6165,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
}
// w/ a ':' after the '#', it's just like a plain ':'.
LLVM_FALLTHROUGH;
-
+ }
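
The reworked Dollar/Hash case above keeps a leading '$' when it is immediately followed by an identifier or an integer, so '$foo' and '$42' parse as symbol names while '#42' and '$ 42' remain immediates. The real code inspects lexer tokens via peekTok; the sketch below approximates the same rule on raw operand text and is only an illustration:

#include <cctype>
#include <iostream>
#include <string>

// Hypothetical mirror of the rule in the hunk above: a '#' always introduces
// an immediate expression; a '$' introduces an immediate only when it is NOT
// immediately followed (without whitespace) by an identifier or an integer,
// in which case the whole token is treated as a symbol name.
static std::string classify(const std::string &Op) {
  if (Op.empty())
    return "empty";
  if (Op[0] == '#')
    return "immediate";
  if (Op[0] == '$') {
    if (Op.size() > 1 && (std::isalnum((unsigned char)Op[1]) || Op[1] == '_'))
      return "symbol";
    return "immediate";
  }
  return "other";
}

int main() {
  for (const char *Op : {"#42", "$ 42", "$foo", "$42"})
    std::cout << Op << " -> " << classify(Op) << "\n";
  return 0;
}
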
case AsmToken::Colon: {
S = Parser.getTok().getLoc();
// ":lower16:" and ":upper16:" expression prefixes
@@ -6233,6 +6321,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
Mnemonic == "bxns" || Mnemonic == "blxns" ||
+ Mnemonic == "vdot" || Mnemonic == "vmmla" ||
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
@@ -6373,14 +6462,20 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic,
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
+ Mnemonic == "vfmat" || Mnemonic == "vfmab" ||
+ Mnemonic == "vdot" || Mnemonic == "vmmla" ||
Mnemonic == "sb" || Mnemonic == "ssbb" ||
- Mnemonic == "pssbb" ||
+ Mnemonic == "pssbb" || Mnemonic == "vsmmla" ||
+ Mnemonic == "vummla" || Mnemonic == "vusmmla" ||
+ Mnemonic == "vusdot" || Mnemonic == "vsudot" ||
Mnemonic == "bfcsel" || Mnemonic == "wls" ||
Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" ||
Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" ||
Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" ||
Mnemonic == "cset" || Mnemonic == "csetm" ||
Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") ||
+ (hasCDE() && MS.isCDEInstr(Mnemonic) &&
+ !MS.isITPredicableCDEInstr(Mnemonic)) ||
(hasMVE() &&
(Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") ||
Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") ||
@@ -6770,6 +6865,69 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
}
+// Dual-register instructions have the following syntax:
+// <mnemonic> <predicate>? <coproc>, <Rdest>, <Rdest+1>, <Rsrc>, ..., #imm
+// This function tries to remove <Rdest+1> and replace <Rdest> with a pair
+// operand. If the conversion fails, an error is diagnosed and the function
+// returns true.
+bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic,
+ OperandVector &Operands) {
+ assert(MS.isCDEDualRegInstr(Mnemonic));
+ bool isPredicable =
+ Mnemonic == "cx1da" || Mnemonic == "cx2da" || Mnemonic == "cx3da";
+ size_t NumPredOps = isPredicable ? 1 : 0;
+
+ if (Operands.size() <= 3 + NumPredOps)
+ return false;
+
+ StringRef Op2Diag(
+ "operand must be an even-numbered register in the range [r0, r10]");
+
+ const MCParsedAsmOperand &Op2 = *Operands[2 + NumPredOps];
+ if (!Op2.isReg())
+ return Error(Op2.getStartLoc(), Op2Diag);
+
+ unsigned RNext;
+ unsigned RPair;
+ switch (Op2.getReg()) {
+ default:
+ return Error(Op2.getStartLoc(), Op2Diag);
+ case ARM::R0:
+ RNext = ARM::R1;
+ RPair = ARM::R0_R1;
+ break;
+ case ARM::R2:
+ RNext = ARM::R3;
+ RPair = ARM::R2_R3;
+ break;
+ case ARM::R4:
+ RNext = ARM::R5;
+ RPair = ARM::R4_R5;
+ break;
+ case ARM::R6:
+ RNext = ARM::R7;
+ RPair = ARM::R6_R7;
+ break;
+ case ARM::R8:
+ RNext = ARM::R9;
+ RPair = ARM::R8_R9;
+ break;
+ case ARM::R10:
+ RNext = ARM::R11;
+ RPair = ARM::R10_R11;
+ break;
+ }
+
+ const MCParsedAsmOperand &Op3 = *Operands[3 + NumPredOps];
+ if (!Op3.isReg() || Op3.getReg() != RNext)
+ return Error(Op3.getStartLoc(), "operand must be a consecutive register");
+
+ Operands.erase(Operands.begin() + 3 + NumPredOps);
+ Operands[2 + NumPredOps] =
+ ARMOperand::CreateReg(RPair, Op2.getStartLoc(), Op2.getEndLoc());
+ return false;
+}
+
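
CDEConvertDualRegOperand above maps an even-numbered register in r0..r10 to its GPR pair and requires the next operand to be the following odd register. A standalone sketch of that mapping, with registers modelled as strings rather than ARM::* enumerators:

#include <iostream>
#include <string>

// Standalone sketch of the mapping performed by CDEConvertDualRegOperand:
// an even register in r0..r10 selects a GPR pair, and the following operand
// must name the next (odd) register.
struct PairInfo {
  std::string Next;   // expected second register, e.g. "r3"
  std::string Pair;   // combined pair operand, e.g. "r2_r3"
  bool Valid = false;
};

static PairInfo pairForEvenReg(int RegNo) {
  PairInfo P;
  if (RegNo < 0 || RegNo > 10 || (RegNo % 2) != 0)
    return P; // must be an even-numbered register in [r0, r10]
  P.Next = "r" + std::to_string(RegNo + 1);
  P.Pair = "r" + std::to_string(RegNo) + "_r" + std::to_string(RegNo + 1);
  P.Valid = true;
  return P;
}

int main() {
  for (int R : {2, 10, 7, 12}) {
    PairInfo P = pairForEvenReg(R);
    std::cout << "r" << R << ": "
              << (P.Valid ? P.Pair + " (next must be " + P.Next + ")"
                          : std::string("invalid dual-reg destination"))
              << "\n";
  }
  return 0;
}
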
/// Parse an arm instruction mnemonic followed by its operands.
bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
@@ -6786,7 +6944,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// First check for the ARM-specific .req directive.
if (Parser.getTok().is(AsmToken::Identifier) &&
- Parser.getTok().getIdentifier() == ".req") {
+ Parser.getTok().getIdentifier().lower() == ".req") {
parseDirectiveReq(Name, NameLoc);
// We always return 'error' for this, as we're done with this
// statement and don't need to match the instruction.
@@ -6823,6 +6981,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// ITx -> x100 (ITT -> 0100, ITE -> 1100)
// ITxy -> xy10 (e.g. ITET -> 1010)
// ITxyz -> xyz1 (e.g. ITEET -> 1101)
+ // Note: See the ARM::PredBlockMask enum in
+ // /lib/Target/ARM/Utils/ARMBaseInfo.h
if (Mnemonic == "it" || Mnemonic.startswith("vpt") ||
Mnemonic.startswith("vpst")) {
SMLoc Loc = Mnemonic == "it" ? SMLoc::getFromPointer(NameLoc.getPointer() + 2) :
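
The comment in the preceding hunk spells out the parser's 4-bit IT/VPT mask: one bit per 't'/'e' suffix character (t=0, e=1) from the most significant bit down, then a terminating 1, then zero padding, e.g. ITT -> 0100 and ITEET -> 1101. The sketch below implements just that textual encoding; the bare "it" case producing 1000 is inferred from the same rule rather than quoted from the hunk, and the later condition-code adjustment the parser performs is not modelled.

#include <cassert>
#include <cstdio>
#include <string>

// Encode an IT/VPT suffix ("", "t", "e", "tt", "tet", ...) into the 4-bit
// mask format described above: suffix bits (t=0, e=1) from the MSB down,
// then a single terminating 1, then zero padding.
static unsigned encodeSuffixMask(const std::string &Suffix) {
  assert(Suffix.size() <= 3 && "at most three extra condition slots");
  unsigned Mask = 0;
  unsigned Bit = 3; // start at the MSB of the 4-bit field
  for (char C : Suffix) {
    if (C == 'e')
      Mask |= 1u << Bit;
    --Bit;
  }
  Mask |= 1u << Bit; // terminating 1
  return Mask;
}

int main() {
  std::printf("IT    -> %x\n", encodeSuffixMask(""));    // 8 (1000)
  std::printf("ITT   -> %x\n", encodeSuffixMask("t"));   // 4 (0100)
  std::printf("ITE   -> %x\n", encodeSuffixMask("e"));   // c (1100)
  std::printf("ITET  -> %x\n", encodeSuffixMask("et"));  // a (1010)
  std::printf("ITEET -> %x\n", encodeSuffixMask("eet")); // d (1101)
  return 0;
}
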
@@ -6969,6 +7129,21 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+ if (hasCDE() && MS.isCDEInstr(Mnemonic)) {
+ // Dual-register instructions use an even-odd register pair as their
+ // destination operand; in assembly such a pair is spelled as two
+ // consecutive registers, without any special syntax.
+ // CDEConvertDualRegOperand tries to convert such an operand into a
+ // register pair, e.g. r2, r3 -> r2_r3. It returns true if an error
+ // message has been emitted. If it returns false, either the conversion
+ // succeeded or an error (e.g. a missing operand) will be diagnosed
+ // elsewhere.
+ if (MS.isCDEDualRegInstr(Mnemonic)) {
+ bool GotError = CDEConvertDualRegOperand(Mnemonic, Operands);
+ if (GotError)
+ return GotError;
+ }
+ }
+
// Some instructions, mostly Thumb, have forms for the same mnemonic that
// do and don't have a cc_out optional-def operand. With some spot-checks
// of the operand list, we can figure out which variant we're trying to
@@ -7947,6 +8122,142 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
break;
}
+ case ARM::UMAAL:
+ case ARM::UMLAL:
+ case ARM::UMULL:
+ case ARM::t2UMAAL:
+ case ARM::t2UMLAL:
+ case ARM::t2UMULL:
+ case ARM::SMLAL:
+ case ARM::SMLALBB:
+ case ARM::SMLALBT:
+ case ARM::SMLALD:
+ case ARM::SMLALDX:
+ case ARM::SMLALTB:
+ case ARM::SMLALTT:
+ case ARM::SMLSLD:
+ case ARM::SMLSLDX:
+ case ARM::SMULL:
+ case ARM::t2SMLAL:
+ case ARM::t2SMLALBB:
+ case ARM::t2SMLALBT:
+ case ARM::t2SMLALD:
+ case ARM::t2SMLALDX:
+ case ARM::t2SMLALTB:
+ case ARM::t2SMLALTT:
+ case ARM::t2SMLSLD:
+ case ARM::t2SMLSLDX:
+ case ARM::t2SMULL: {
+ unsigned RdHi = Inst.getOperand(0).getReg();
+ unsigned RdLo = Inst.getOperand(1).getReg();
+ if(RdHi == RdLo) {
+ return Error(Loc,
+ "unpredictable instruction, RdHi and RdLo must be different");
+ }
+ break;
+ }
+
+ case ARM::CDE_CX1:
+ case ARM::CDE_CX1A:
+ case ARM::CDE_CX1D:
+ case ARM::CDE_CX1DA:
+ case ARM::CDE_CX2:
+ case ARM::CDE_CX2A:
+ case ARM::CDE_CX2D:
+ case ARM::CDE_CX2DA:
+ case ARM::CDE_CX3:
+ case ARM::CDE_CX3A:
+ case ARM::CDE_CX3D:
+ case ARM::CDE_CX3DA:
+ case ARM::CDE_VCX1_vec:
+ case ARM::CDE_VCX1_fpsp:
+ case ARM::CDE_VCX1_fpdp:
+ case ARM::CDE_VCX1A_vec:
+ case ARM::CDE_VCX1A_fpsp:
+ case ARM::CDE_VCX1A_fpdp:
+ case ARM::CDE_VCX2_vec:
+ case ARM::CDE_VCX2_fpsp:
+ case ARM::CDE_VCX2_fpdp:
+ case ARM::CDE_VCX2A_vec:
+ case ARM::CDE_VCX2A_fpsp:
+ case ARM::CDE_VCX2A_fpdp:
+ case ARM::CDE_VCX3_vec:
+ case ARM::CDE_VCX3_fpsp:
+ case ARM::CDE_VCX3_fpdp:
+ case ARM::CDE_VCX3A_vec:
+ case ARM::CDE_VCX3A_fpsp:
+ case ARM::CDE_VCX3A_fpdp: {
+ assert(Inst.getOperand(1).isImm() &&
+ "CDE operand 1 must be a coprocessor ID");
+ int64_t Coproc = Inst.getOperand(1).getImm();
+ if (Coproc < 8 && !ARM::isCDECoproc(Coproc, *STI))
+ return Error(Operands[1]->getStartLoc(),
+ "coprocessor must be configured as CDE");
+ else if (Coproc >= 8)
+ return Error(Operands[1]->getStartLoc(),
+ "coprocessor must be in the range [p0, p7]");
+ break;
+ }
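
The CDE_CX*/CDE_VCX* case above rejects coprocessor numbers outside p0..p7 and coprocessors the subtarget does not configure as CDE. The sketch below mirrors those two checks, with ARM::isCDECoproc replaced by a caller-supplied bitmask, which is purely an assumption of this illustration:

#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical stand-in for ARM::isCDECoproc: bit N of CDEMask says whether
// coprocessor pN is configured for the Custom Datapath Extension.
static std::string checkCDECoproc(int64_t Coproc, uint8_t CDEMask) {
  if (Coproc >= 8)
    return "error: coprocessor must be in the range [p0, p7]";
  if (!(CDEMask & (1u << Coproc)))
    return "error: coprocessor must be configured as CDE";
  return "ok";
}

int main() {
  const uint8_t CDEMask = 0b00000011; // assume p0 and p1 are CDE-capable
  for (int64_t C : {0, 3, 9})
    std::cout << "p" << C << ": " << checkCDECoproc(C, CDEMask) << "\n";
  return 0;
}
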
+
+ case ARM::t2CDP:
+ case ARM::t2CDP2:
+ case ARM::t2LDC2L_OFFSET:
+ case ARM::t2LDC2L_OPTION:
+ case ARM::t2LDC2L_POST:
+ case ARM::t2LDC2L_PRE:
+ case ARM::t2LDC2_OFFSET:
+ case ARM::t2LDC2_OPTION:
+ case ARM::t2LDC2_POST:
+ case ARM::t2LDC2_PRE:
+ case ARM::t2LDCL_OFFSET:
+ case ARM::t2LDCL_OPTION:
+ case ARM::t2LDCL_POST:
+ case ARM::t2LDCL_PRE:
+ case ARM::t2LDC_OFFSET:
+ case ARM::t2LDC_OPTION:
+ case ARM::t2LDC_POST:
+ case ARM::t2LDC_PRE:
+ case ARM::t2MCR:
+ case ARM::t2MCR2:
+ case ARM::t2MCRR:
+ case ARM::t2MCRR2:
+ case ARM::t2MRC:
+ case ARM::t2MRC2:
+ case ARM::t2MRRC:
+ case ARM::t2MRRC2:
+ case ARM::t2STC2L_OFFSET:
+ case ARM::t2STC2L_OPTION:
+ case ARM::t2STC2L_POST:
+ case ARM::t2STC2L_PRE:
+ case ARM::t2STC2_OFFSET:
+ case ARM::t2STC2_OPTION:
+ case ARM::t2STC2_POST:
+ case ARM::t2STC2_PRE:
+ case ARM::t2STCL_OFFSET:
+ case ARM::t2STCL_OPTION:
+ case ARM::t2STCL_POST:
+ case ARM::t2STCL_PRE:
+ case ARM::t2STC_OFFSET:
+ case ARM::t2STC_OPTION:
+ case ARM::t2STC_POST:
+ case ARM::t2STC_PRE: {
+ unsigned Opcode = Inst.getOpcode();
+ // Inst.getOperand indexes operands in the (oops ...) and (iops ...) dags,
+ // CopInd is the index of the coprocessor operand.
+ size_t CopInd = 0;
+ if (Opcode == ARM::t2MRRC || Opcode == ARM::t2MRRC2)
+ CopInd = 2;
+ else if (Opcode == ARM::t2MRC || Opcode == ARM::t2MRC2)
+ CopInd = 1;
+ assert(Inst.getOperand(CopInd).isImm() &&
+ "Operand must be a coprocessor ID");
+ int64_t Coproc = Inst.getOperand(CopInd).getImm();
+ // Operands[2] is the coprocessor operand at the syntactic level.
+ if (ARM::isCDECoproc(Coproc, *STI))
+ return Error(Operands[2]->getStartLoc(),
+ "coprocessor must be configured as GCP");
+ break;
+ }
}
return false;
@@ -8223,50 +8534,6 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
switch (Inst.getOpcode()) {
- case ARM::MVE_VORNIZ0v4i32:
- case ARM::MVE_VORNIZ0v8i16:
- case ARM::MVE_VORNIZ8v4i32:
- case ARM::MVE_VORNIZ8v8i16:
- case ARM::MVE_VORNIZ16v4i32:
- case ARM::MVE_VORNIZ24v4i32:
- case ARM::MVE_VANDIZ0v4i32:
- case ARM::MVE_VANDIZ0v8i16:
- case ARM::MVE_VANDIZ8v4i32:
- case ARM::MVE_VANDIZ8v8i16:
- case ARM::MVE_VANDIZ16v4i32:
- case ARM::MVE_VANDIZ24v4i32: {
- unsigned Opcode;
- bool imm16 = false;
- switch(Inst.getOpcode()) {
- case ARM::MVE_VORNIZ0v4i32: Opcode = ARM::MVE_VORRIZ0v4i32; break;
- case ARM::MVE_VORNIZ0v8i16: Opcode = ARM::MVE_VORRIZ0v8i16; imm16 = true; break;
- case ARM::MVE_VORNIZ8v4i32: Opcode = ARM::MVE_VORRIZ8v4i32; break;
- case ARM::MVE_VORNIZ8v8i16: Opcode = ARM::MVE_VORRIZ8v8i16; imm16 = true; break;
- case ARM::MVE_VORNIZ16v4i32: Opcode = ARM::MVE_VORRIZ16v4i32; break;
- case ARM::MVE_VORNIZ24v4i32: Opcode = ARM::MVE_VORRIZ24v4i32; break;
- case ARM::MVE_VANDIZ0v4i32: Opcode = ARM::MVE_VBICIZ0v4i32; break;
- case ARM::MVE_VANDIZ0v8i16: Opcode = ARM::MVE_VBICIZ0v8i16; imm16 = true; break;
- case ARM::MVE_VANDIZ8v4i32: Opcode = ARM::MVE_VBICIZ8v4i32; break;
- case ARM::MVE_VANDIZ8v8i16: Opcode = ARM::MVE_VBICIZ8v8i16; imm16 = true; break;
- case ARM::MVE_VANDIZ16v4i32: Opcode = ARM::MVE_VBICIZ16v4i32; break;
- case ARM::MVE_VANDIZ24v4i32: Opcode = ARM::MVE_VBICIZ24v4i32; break;
- default: llvm_unreachable("unexpected opcode");
- }
-
- MCInst TmpInst;
- TmpInst.setOpcode(Opcode);
- TmpInst.addOperand(Inst.getOperand(0));
- TmpInst.addOperand(Inst.getOperand(1));
-
- // invert immediate
- unsigned imm = ~Inst.getOperand(2).getImm() & (imm16 ? 0xffff : 0xffffffff);
- TmpInst.addOperand(MCOperand::createImm(imm));
-
- TmpInst.addOperand(Inst.getOperand(3));
- TmpInst.addOperand(Inst.getOperand(4));
- Inst = TmpInst;
- return true;
- }
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
case ARM::LDRBT_POST: {
@@ -8285,6 +8552,26 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
Inst = TmpInst;
return true;
}
+ // Alias for 'ldr{sb,h,sh}t Rt, [Rn] {, #imm}' for omitted immediate.
+ case ARM::LDRSBTii:
+ case ARM::LDRHTii:
+ case ARM::LDRSHTii: {
+ MCInst TmpInst;
+
+ if (Inst.getOpcode() == ARM::LDRSBTii)
+ TmpInst.setOpcode(ARM::LDRSBTi);
+ else if (Inst.getOpcode() == ARM::LDRHTii)
+ TmpInst.setOpcode(ARM::LDRHTi);
+ else if (Inst.getOpcode() == ARM::LDRSHTii)
+ TmpInst.setOpcode(ARM::LDRSHTi);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::createImm(256));
+ TmpInst.addOperand(Inst.getOperand(2));
+ Inst = TmpInst;
+ return true;
+ }
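
The LDRSBTii/LDRHTii/LDRSHTii cases above expand the omitted-immediate alias into the canonical post-indexed opcode, duplicating the base register and appending an immediate of 256. The toy model below reproduces that operand rewrite; reading 256 as "zero offset, add" in the addrmode3 immediate encoding is an assumption of the sketch, not something the patch states.

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy model of the MCInst rewrite in the hunk above: the 'ii' alias forms
// with an omitted immediate are rewritten to the canonical post-indexed
// opcodes, duplicating the base register and appending an immediate of 256.
struct ToyInst {
  std::string Opcode;
  std::vector<std::string> Operands;
};

static bool expandOmittedImmAlias(ToyInst &Inst) {
  static const std::map<std::string, std::string> Alias = {
      {"LDRSBTii", "LDRSBTi"}, {"LDRHTii", "LDRHTi"}, {"LDRSHTii", "LDRSHTi"}};
  auto It = Alias.find(Inst.Opcode);
  if (It == Alias.end() || Inst.Operands.size() < 3)
    return false;
  ToyInst Tmp;
  Tmp.Opcode = It->second;
  Tmp.Operands = {Inst.Operands[0], Inst.Operands[1], Inst.Operands[1],
                  "#256", Inst.Operands[2]};
  Inst = Tmp;
  return true;
}

int main() {
  ToyInst I{"LDRSHTii", {"r0", "r1", "pred:al"}};
  expandOmittedImmAlias(I);
  std::cout << I.Opcode;
  for (const std::string &Op : I.Operands)
    std::cout << " " << Op;
  std::cout << "\n"; // LDRSHTi r0 r1 r1 #256 pred:al
  return 0;
}
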
// Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
case ARM::STRT_POST:
case ARM::STRBT_POST: {
@@ -8323,7 +8610,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// Reading PC provides the start of the current instruction + 8 and
// the transform to adr is biased by that.
MCSymbol *Dot = getContext().createTempSymbol();
- Out.EmitLabel(Dot);
+ Out.emitLabel(Dot);
const MCExpr *OpExpr = Inst.getOperand(2).getExpr();
const MCExpr *InstPC = MCSymbolRefExpr::create(Dot,
MCSymbolRefExpr::VK_None,
@@ -10521,7 +10808,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (isITBlockFull() || isITBlockTerminator(Inst))
flushPendingInstructions(Out);
} else {
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
}
return false;
case Match_NearMisses:
@@ -10546,7 +10833,7 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
bool IsMachO = Format == MCObjectFileInfo::IsMachO;
bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
- StringRef IDVal = DirectiveID.getIdentifier();
+ std::string IDVal = DirectiveID.getIdentifier().lower();
if (IDVal == ".word")
parseLiteralValues(4, DirectiveID.getLoc());
else if (IDVal == ".short" || IDVal == ".hword")
@@ -10632,7 +10919,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitValue(Value, Size, L);
+ getParser().getStreamer().emitValue(Value, Size, L);
return false;
};
return (parseMany(parseOne));
@@ -10648,7 +10935,7 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
if (!isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
return false;
}
@@ -10661,7 +10948,7 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
if (isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
return false;
}
@@ -10673,7 +10960,7 @@ void ARMAsmParser::doBeforeLabelEmit(MCSymbol *Symbol) {
void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
if (NextSymbolIsThumb) {
- getParser().getStreamer().EmitThumbFunc(Symbol);
+ getParser().getStreamer().emitThumbFunc(Symbol);
NextSymbolIsThumb = false;
}
}
@@ -10693,7 +10980,7 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
Parser.getTok().is(AsmToken::String)) {
MCSymbol *Func = getParser().getContext().getOrCreateSymbol(
Parser.getTok().getIdentifier());
- getParser().getStreamer().EmitThumbFunc(Func);
+ getParser().getStreamer().emitThumbFunc(Func);
Parser.Lex();
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.thumb_func' directive"))
@@ -10757,14 +11044,14 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
if (!isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
} else {
if (!hasARM())
return Error(L, "target does not support ARM mode");
if (isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
}
return false;
@@ -10817,7 +11104,7 @@ void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) {
SwitchMode();
} else {
// Mode switch forced, because the new arch doesn't support the old mode.
- getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16
+ getParser().getStreamer().emitAssemblerFlag(isThumb() ? MCAF_Code16
: MCAF_Code32);
// Warn about the implicit mode switch. GAS does not switch modes here,
// but instead stays in the old mode, reporting an error on any following
@@ -10859,11 +11146,13 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
TagLoc = Parser.getTok().getLoc();
if (Parser.getTok().is(AsmToken::Identifier)) {
StringRef Name = Parser.getTok().getIdentifier();
- Tag = ARMBuildAttrs::AttrTypeFromString(Name);
- if (Tag == -1) {
+ Optional<unsigned> Ret =
+ ELFAttrs::attrTypeFromString(Name, ARMBuildAttrs::ARMAttributeTags);
+ if (!Ret.hasValue()) {
Error(TagLoc, "attribute name not recognised: " + Name);
return false;
}
+ Tag = Ret.getValue();
Parser.Lex();
} else {
const MCExpr *AttrExpr;
@@ -11314,9 +11603,9 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign())
- getStreamer().EmitCodeAlignment(2);
+ getStreamer().emitCodeAlignment(2);
else
- getStreamer().EmitValueToAlignment(2);
+ getStreamer().emitValueToAlignment(2);
return false;
}
@@ -11516,9 +11805,9 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign())
- getStreamer().EmitCodeAlignment(4, 0);
+ getStreamer().emitCodeAlignment(4, 0);
else
- getStreamer().EmitValueToAlignment(4, 0, 1, 0);
+ getStreamer().emitValueToAlignment(4, 0, 1, 0);
return false;
}
return true;
@@ -11770,7 +12059,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
// when we start to table-generate them, and we can use the ARM
// flags below, that were generated by table-gen.
static const struct {
- const unsigned Kind;
+ const uint64_t Kind;
const FeatureBitset ArchCheck;
const FeatureBitset Features;
} Extensions[] = {
@@ -11819,7 +12108,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
EnableFeature = false;
Name = Name.substr(2);
}
- unsigned FeatureKind = ARM::parseArchExt(Name);
+ uint64_t FeatureKind = ARM::parseArchExt(Name);
if (FeatureKind == ARM::AEK_INVALID)
return Error(ExtLoc, "unknown architectural extension: " + Name);
@@ -11969,6 +12258,7 @@ bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic,
Mnemonic.startswith("vpnot") || Mnemonic.startswith("vbic") ||
Mnemonic.startswith("vrmlsldavh") || Mnemonic.startswith("vmlsldav") ||
Mnemonic.startswith("vcvt") ||
+ MS.isVPTPredicableCDEInstr(Mnemonic) ||
(Mnemonic.startswith("vmov") &&
!(ExtraToken == ".f16" || ExtraToken == ".32" ||
ExtraToken == ".16" || ExtraToken == ".8"));
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index d26b04556abb..54ff0d9966cb 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -182,6 +182,9 @@ static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus
+DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
@@ -201,6 +204,8 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -538,10 +543,6 @@ template<unsigned MinLog, unsigned MaxLog>
static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
const void *Decoder);
-template <int shift>
-static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
template<unsigned start>
static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
@@ -1087,8 +1088,12 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
}
}
+ uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4);
+ const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI)
+ ? DecoderTableThumb2CDE32
+ : DecoderTableThumb2CoProc32;
Result =
- decodeInstruction(DecoderTableThumb2CoProc32, MI, Insn32, Address, this, STI);
+ decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
Check(Result, AddThumbPredicate(MI));
@@ -1220,10 +1225,12 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
+ // According to the Arm ARM, RegNo = 14 is undefined, but we return Fail
+ // rather than SoftFail, as there is no GPRPair table entry for index 7.
if (RegNo > 13)
return MCDisassembler::Fail;
- if ((RegNo & 1) || RegNo == 0xe)
+ if (RegNo & 1)
S = MCDisassembler::SoftFail;
unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
@@ -1231,6 +1238,19 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
return S;
}
+static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 13)
+ return MCDisassembler::Fail;
+
+ unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
+ Inst.addOperand(MCOperand::createReg(RegisterPair));
+
+ if ((RegNo & 1) || RegNo > 10)
+ return MCDisassembler::SoftFail;
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder) {
@@ -6068,6 +6088,23 @@ static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus
+DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo == 15) {
+ Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV));
+ return MCDisassembler::Success;
+ }
+
+ unsigned Register = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+
+ if (RegNo == 13)
+ return MCDisassembler::SoftFail;
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -6395,16 +6432,6 @@ static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
return S;
}
-template <int shift>
-static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- Val <<= shift;
-
- Inst.addOperand(MCOperand::createImm(Val));
- return MCDisassembler::Success;
-}
-
template<unsigned start>
static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index be02da18fb7d..9ad595c016c4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -48,10 +48,17 @@ public:
} // end anonymous namespace
Optional<MCFixupKind> ARMAsmBackend::getFixupKind(StringRef Name) const {
- if (STI.getTargetTriple().isOSBinFormatELF() && Name == "R_ARM_NONE")
- return FK_NONE;
-
- return MCAsmBackend::getFixupKind(Name);
+ if (!STI.getTargetTriple().isOSBinFormatELF())
+ return None;
+
+ unsigned Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/ARM.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
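
getFixupKind above now turns any relocation name from ELFRelocs/ARM.def into FirstLiteralRelocationKind plus the relocation type, returning None for non-ELF targets and unknown names. The standalone sketch below shows the same mapping with a small hand-written table in place of the .def include; the numeric value chosen for FirstLiteralRelocationKind is illustrative only, the real constant lives in llvm/MC/MCFixup.h.

#include <iostream>
#include <map>
#include <string>

// Illustrative stand-in for MCFixupKind's FirstLiteralRelocationKind.
static const unsigned FirstLiteralRelocationKind = 2000;

// A few entries that ELFRelocs/ARM.def would expand via the ELF_RELOC macro.
static const std::map<std::string, unsigned> ARMRelocs = {
    {"R_ARM_NONE", 0}, {"R_ARM_ABS32", 2}, {"R_ARM_REL32", 3},
    {"R_ARM_BASE_PREL", 25}};

// Returns -1 when the name is unknown, mirroring the Optional<> None case.
static int getFixupKind(const std::string &Name, bool IsELF) {
  if (!IsELF)
    return -1;
  auto It = ARMRelocs.find(Name);
  if (It == ARMRelocs.end())
    return -1;
  return int(FirstLiteralRelocationKind + It->second);
}

int main() {
  std::cout << getFixupKind("R_ARM_ABS32", true) << "\n";  // 2002
  std::cout << getFixupKind("R_ARM_ABS32", false) << "\n"; // -1
  std::cout << getFixupKind("R_ARM_BOGUS", true) << "\n";  // -1
  return 0;
}
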
const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
@@ -166,6 +173,11 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
};
+ // Fixup kinds from the .reloc directive are like R_ARM_NONE. They do not
+ // require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -310,9 +322,8 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
return reasonForFixupRelaxation(Fixup, Value);
}
-void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void ARMAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode(), STI);
// Sanity check w/ diagnostic if we get here w/ a bogus instruction.
@@ -328,17 +339,18 @@ void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
// have to change the operands too.
if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) &&
RelaxedOp == ARM::tHINT) {
+ MCInst Res;
Res.setOpcode(RelaxedOp);
Res.addOperand(MCOperand::createImm(0));
Res.addOperand(MCOperand::createImm(14));
Res.addOperand(MCOperand::createReg(0));
+ Inst = std::move(Res);
return;
}
// The rest of instructions we're relaxing have the same operands.
// We just need to update to the proper opcode.
- Res = Inst;
- Res.setOpcode(RelaxedOp);
+ Inst.setOpcode(RelaxedOp);
}
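
relaxInstruction above now mutates the MCInst in place: only the tCBZ/tCBNZ -> tHINT case rebuilds the operand list, everything else just swaps in the relaxed opcode. A toy model of that in-place pattern (instruction and opcode names here are stand-ins, except tHINT's operand values, which match the hunk):

#include <iostream>
#include <string>
#include <vector>

// Toy model of the in-place relaxation pattern: most relaxations only change
// the opcode; a few also need to rebuild operands, which is done by
// constructing a fresh instruction and moving it over the original.
struct ToyInst {
  std::string Opcode;
  std::vector<int> Operands;
};

static void relaxInstruction(ToyInst &Inst) {
  if (Inst.Opcode == "tCBZ" || Inst.Opcode == "tCBNZ") {
    // Relaxing to a hint (nop) also changes the operand list.
    ToyInst Res;
    Res.Opcode = "tHINT";
    Res.Operands = {0, 14, 0}; // imm 0, predicate AL (14), no pred register
    Inst = std::move(Res);
    return;
  }
  // The rest keep their operands and only need the wider opcode.
  if (Inst.Opcode == "tB")
    Inst.Opcode = "t2B";
}

int main() {
  ToyInst A{"tCBZ", {1, 42}};
  ToyInst B{"tB", {100}};
  relaxInstruction(A);
  relaxInstruction(B);
  std::cout << A.Opcode << " " << B.Opcode << "\n"; // tHINT t2B
  return 0;
}
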
bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
@@ -432,7 +444,6 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
default:
Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
return 0;
- case FK_NONE:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -865,7 +876,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
const unsigned FixupKind = Fixup.getKind();
- if (FixupKind == FK_NONE)
+ if (FixupKind >= FirstLiteralRelocationKind)
return true;
if (FixupKind == ARM::fixup_arm_thumb_bl) {
assert(Sym && "How did we resolve this?");
@@ -909,9 +920,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
- return 0;
-
case FK_Data_1:
case ARM::fixup_arm_thumb_bcc:
case ARM::fixup_arm_thumb_cp:
@@ -973,9 +981,6 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
- return 0;
-
case FK_Data_1:
return 1;
case FK_Data_2:
@@ -1031,7 +1036,10 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo* STI) const {
- unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned NumBytes = getFixupKindNumBytes(Kind);
MCContext &Ctx = Asm.getContext();
Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI);
if (!Value)
@@ -1043,7 +1051,7 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// Used to point to big endian bytes.
unsigned FullSizeBytes;
if (Endian == support::big) {
- FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+ FullSizeBytes = getFixupKindContainerSizeBytes(Kind);
assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
}
@@ -1110,11 +1118,11 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
const MCCFIInstruction &Inst = Instrs[i];
switch (Inst.getOperation()) {
case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
- CFARegisterOffset = -Inst.getOffset();
+ CFARegisterOffset = Inst.getOffset();
CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
break;
case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
- CFARegisterOffset = -Inst.getOffset();
+ CFARegisterOffset = Inst.getOffset();
break;
case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
@@ -1271,35 +1279,6 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
return CompactUnwindEncoding | ((FloatRegCount - 1) << 8);
}
-static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
- ARM::ArchKind AK = ARM::parseArch(Arch);
- switch (AK) {
- default:
- return MachO::CPU_SUBTYPE_ARM_V7;
- case ARM::ArchKind::ARMV4T:
- return MachO::CPU_SUBTYPE_ARM_V4T;
- case ARM::ArchKind::ARMV5T:
- case ARM::ArchKind::ARMV5TE:
- case ARM::ArchKind::ARMV5TEJ:
- return MachO::CPU_SUBTYPE_ARM_V5;
- case ARM::ArchKind::ARMV6:
- case ARM::ArchKind::ARMV6K:
- return MachO::CPU_SUBTYPE_ARM_V6;
- case ARM::ArchKind::ARMV7A:
- return MachO::CPU_SUBTYPE_ARM_V7;
- case ARM::ArchKind::ARMV7S:
- return MachO::CPU_SUBTYPE_ARM_V7S;
- case ARM::ArchKind::ARMV7K:
- return MachO::CPU_SUBTYPE_ARM_V7K;
- case ARM::ArchKind::ARMV6M:
- return MachO::CPU_SUBTYPE_ARM_V6M;
- case ARM::ArchKind::ARMV7M:
- return MachO::CPU_SUBTYPE_ARM_V7M;
- case ARM::ArchKind::ARMV7EM:
- return MachO::CPU_SUBTYPE_ARM_V7EM;
- }
-}
-
static MCAsmBackend *createARMAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
@@ -1309,10 +1288,8 @@ static MCAsmBackend *createARMAsmBackend(const Target &T,
switch (TheTriple.getObjectFormat()) {
default:
llvm_unreachable("unsupported object format");
- case Triple::MachO: {
- MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName());
- return new ARMAsmBackendDarwin(T, STI, MRI, CS);
- }
+ case Triple::MachO:
+ return new ARMAsmBackendDarwin(T, STI, MRI);
case Triple::COFF:
assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
return new ARMAsmBackendWinCOFF(T, STI);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 67722a5e5b64..38c7b30769b3 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -66,8 +66,8 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 87e56940f46d..e27bb134670f 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -16,16 +16,20 @@
namespace llvm {
class ARMAsmBackendDarwin : public ARMAsmBackend {
const MCRegisterInfo &MRI;
+ Triple TT;
public:
const MachO::CPUSubTypeARM Subtype;
ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
- : ARMAsmBackend(T, STI, support::little), MRI(MRI), Subtype(st) {}
+ const MCRegisterInfo &MRI)
+ : ARMAsmBackend(T, STI, support::little), MRI(MRI),
+ TT(STI.getTargetTriple()),
+ Subtype((MachO::CPUSubTypeARM)cantFail(
+ MachO::getCPUSubType(STI.getTargetTriple()))) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createARMMachObjectWriter(/*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
- Subtype);
+ return createARMMachObjectWriter(
+ /*Is64Bit=*/false, cantFail(MachO::getCPUType(TT)), Subtype);
}
uint32_t generateCompactUnwindEncoding(
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 6293a2462306..74cd2e681ded 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -393,9 +393,21 @@ namespace ARMII {
// in an IT block).
ThumbArithFlagSetting = 1 << 19,
- // Whether an instruction can be included in an MVE tail-predicated loop.
+ // Whether an instruction can be included in an MVE tail-predicated loop,
+ // though extra validity checks may need to be performed too.
ValidForTailPredication = 1 << 20,
+ // Whether an instruction writes to the top/bottom half of a vector element
+ // and leaves the other half untouched.
+ RetainsPreviousHalfElement = 1 << 21,
+
+ // Whether the instruction produces a scalar result from vector operands.
+ HorizontalReduction = 1 << 22,
+
+ // Whether this instruction produces a vector result that is larger than
+ // its input, typically reading from the top/bottom halves of the input(s).
+ DoubleWidthResult = 1 << 23,
+
//===------------------------------------------------------------------===//
// Code domain.
DomainShift = 15,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 2c26dd388c05..37d81e4b0af1 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -53,8 +53,8 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const {
- // FIXME: This is extremely conservative. This really needs to use a
- // whitelist with a clear explanation for why each realocation needs to
+ // FIXME: This is extremely conservative. This really needs to use an
+ // explicit list with a clear explanation for why each relocation needs to
// point to the symbol, not to the section.
switch (Type) {
default:
@@ -79,6 +79,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel,
MCContext &Ctx) const {
+ unsigned Kind = Fixup.getTargetKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
if (IsPCRel) {
@@ -89,9 +92,18 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case FK_Data_4:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
- case MCSymbolRefExpr::VK_None:
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 4-byte pc-relative data relocation");
+ return ELF::R_ARM_NONE;
+ case MCSymbolRefExpr::VK_None: {
+ if (const MCSymbolRefExpr *SymRef = Target.getSymA()) {
+ // For GNU AS compatibility expressions such as
+ // For GNU AS compatibility, expressions such as
+ // _GLOBAL_OFFSET_TABLE_ - label emit an R_ARM_BASE_PREL relocation.
+ return ELF::R_ARM_BASE_PREL;
+ }
return ELF::R_ARM_REL32;
+ }
case MCSymbolRefExpr::VK_GOTTPOFF:
return ELF::R_ARM_TLS_IE32;
case MCSymbolRefExpr::VK_ARM_GOT_PREL:
@@ -145,30 +157,34 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
return ELF::R_ARM_THM_BF18;
}
}
- switch (Fixup.getTargetKind()) {
+ switch (Kind) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
- case FK_NONE:
- return ELF::R_ARM_NONE;
case FK_Data_1:
switch (Modifier) {
default:
- llvm_unreachable("unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 1-byte data relocation");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_ABS8;
}
case FK_Data_2:
switch (Modifier) {
default:
- llvm_unreachable("unsupported modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 2-byte data relocation");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_ABS16;
}
case FK_Data_4:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 4-byte data relocation");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_ARM_NONE:
return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_GOT:
@@ -210,7 +226,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_movt_hi16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(), "invalid fixup for ARM MOVT instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_MOVT_ABS;
case MCSymbolRefExpr::VK_ARM_SBREL:
@@ -219,7 +236,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_movw_lo16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(), "invalid fixup for ARM MOVW instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_MOVW_ABS_NC;
case MCSymbolRefExpr::VK_ARM_SBREL:
@@ -228,7 +246,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_t2_movt_hi16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for Thumb MOVT instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_THM_MOVT_ABS;
case MCSymbolRefExpr::VK_ARM_SBREL:
@@ -237,7 +257,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_t2_movw_lo16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for Thumb MOVW instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_THM_MOVW_ABS_NC;
case MCSymbolRefExpr::VK_ARM_SBREL:
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index f558ca8d2d9f..876741d6c343 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -93,7 +93,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer {
void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
StringRef StringValue) override;
void emitArch(ARM::ArchKind Arch) override;
- void emitArchExtension(unsigned ArchExt) override;
+ void emitArchExtension(uint64_t ArchExt) override;
void emitObjectArch(ARM::ArchKind Arch) override;
void emitFPU(unsigned FPU) override;
void emitInst(uint32_t Inst, char Suffix = '\0') override;
@@ -177,7 +177,8 @@ void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {}
void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
if (IsVerboseAsm) {
- StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ StringRef Name =
+ ELFAttrs::attrTypeAsString(Attribute, ARMBuildAttrs::ARMAttributeTags);
if (!Name.empty())
OS << "\t@ " << Name;
}
@@ -193,7 +194,8 @@ void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
default:
OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
if (IsVerboseAsm) {
- StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ StringRef Name = ELFAttrs::attrTypeAsString(
+ Attribute, ARMBuildAttrs::ARMAttributeTags);
if (!Name.empty())
OS << "\t@ " << Name;
}
@@ -212,7 +214,9 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
if (!StringValue.empty())
OS << ", \"" << StringValue << "\"";
if (IsVerboseAsm)
- OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute);
+ OS << "\t@ "
+ << ELFAttrs::attrTypeAsString(Attribute,
+ ARMBuildAttrs::ARMAttributeTags);
break;
}
OS << "\n";
@@ -222,7 +226,7 @@ void ARMTargetAsmStreamer::emitArch(ARM::ArchKind Arch) {
OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
}
-void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
+void ARMTargetAsmStreamer::emitArchExtension(uint64_t ArchExt) {
OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
}
@@ -238,7 +242,7 @@ void ARMTargetAsmStreamer::finishAttributeSection() {}
void
ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
- OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
+ OS << "\t.tlsdescseq\t" << S->getSymbol().getName() << "\n";
}
void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
@@ -328,12 +332,8 @@ private:
}
// Create new attribute item
- AttributeItem Item = {
- AttributeItem::NumericAttribute,
- Attribute,
- Value,
- StringRef("")
- };
+ AttributeItem Item = {AttributeItem::NumericAttribute, Attribute, Value,
+ std::string(StringRef(""))};
Contents.push_back(Item);
}
@@ -344,17 +344,13 @@ private:
if (!OverwriteExisting)
return;
Item->Type = AttributeItem::TextAttribute;
- Item->StringValue = Value;
+ Item->StringValue = std::string(Value);
return;
}
// Create new attribute item
- AttributeItem Item = {
- AttributeItem::TextAttribute,
- Attribute,
- 0,
- Value
- };
+ AttributeItem Item = {AttributeItem::TextAttribute, Attribute, 0,
+ std::string(Value)};
Contents.push_back(Item);
}
@@ -366,17 +362,13 @@ private:
return;
Item->Type = AttributeItem::NumericAndTextAttributes;
Item->IntValue = IntValue;
- Item->StringValue = StringValue;
+ Item->StringValue = std::string(StringValue);
return;
}
// Create new attribute item
- AttributeItem Item = {
- AttributeItem::NumericAndTextAttributes,
- Attribute,
- IntValue,
- StringValue
- };
+ AttributeItem Item = {AttributeItem::NumericAndTextAttributes, Attribute,
+ IntValue, std::string(StringValue)};
Contents.push_back(Item);
}
@@ -452,7 +444,7 @@ public:
~ARMELFStreamer() override = default;
- void FinishImpl() override;
+ void finishImpl() override;
// ARM exception handling directives
void emitFnStart();
@@ -468,13 +460,13 @@ public:
void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc) override {
- EmitDataMappingSymbol();
+ emitDataMappingSymbol();
MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
}
- void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+ void changeSection(MCSection *Section, const MCExpr *Subsection) override {
LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo);
- MCELFStreamer::ChangeSection(Section, Subsection);
+ MCELFStreamer::changeSection(Section, Subsection);
auto LastMappingSymbol = LastMappingSymbols.find(Section);
if (LastMappingSymbol != LastMappingSymbols.end()) {
LastEMSInfo = std::move(LastMappingSymbol->second);
@@ -486,14 +478,14 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
+ void emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) override {
if (IsThumb)
EmitThumbMappingSymbol();
else
EmitARMMappingSymbol();
- MCELFStreamer::EmitInstruction(Inst, STI);
+ MCELFStreamer::emitInstruction(Inst, STI);
}
void emitInst(uint32_t Inst, char Suffix) {
@@ -533,15 +525,15 @@ public:
llvm_unreachable("Invalid Suffix");
}
- MCELFStreamer::EmitBytes(StringRef(Buffer, Size));
+ MCELFStreamer::emitBytes(StringRef(Buffer, Size));
}
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- void EmitBytes(StringRef Data) override {
- EmitDataMappingSymbol();
- MCELFStreamer::EmitBytes(Data);
+ void emitBytes(StringRef Data) override {
+ emitDataMappingSymbol();
+ MCELFStreamer::emitBytes(Data);
}
void FlushPendingMappingSymbol() {
@@ -555,7 +547,7 @@ public:
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
getContext().reportError(Loc, "relocated expression must be 32-bit");
@@ -564,12 +556,12 @@ public:
getOrCreateDataFragment();
}
- EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ emitDataMappingSymbol();
+ MCELFStreamer::emitValueImpl(Value, Size, Loc);
}
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
- MCELFStreamer::EmitAssemblerFlag(Flag);
+ void emitAssemblerFlag(MCAssemblerFlag Flag) override {
+ MCELFStreamer::emitAssemblerFlag(Flag);
switch (Flag) {
case MCAF_SyntaxUnified:
@@ -609,7 +601,7 @@ private:
ElfMappingSymbol State;
};
- void EmitDataMappingSymbol() {
+ void emitDataMappingSymbol() {
if (LastEMSInfo->State == EMS_Data)
return;
else if (LastEMSInfo->State == EMS_None) {
@@ -648,7 +640,7 @@ private:
void EmitMappingSymbol(StringRef Name) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabel(Symbol);
+ emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
@@ -659,15 +651,15 @@ private:
uint64_t Offset) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabelAtPos(Symbol, Loc, F, Offset);
+ emitLabelAtPos(Symbol, Loc, F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
Symbol->setExternal(false);
}
- void EmitThumbFunc(MCSymbol *Func) override {
+ void emitThumbFunc(MCSymbol *Func) override {
getAssembler().setIsThumbFunc(Func);
- EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
+ emitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
}
// Helper functions for ARM exception handling directives
@@ -868,6 +860,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV8_3A:
case ARM::ArchKind::ARMV8_4A:
case ARM::ArchKind::ARMV8_5A:
+ case ARM::ArchKind::ARMV8_6A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1091,7 +1084,7 @@ void ARMTargetELFStreamer::finishAttributeSection() {
Streamer.SwitchSection(AttributeSection);
// Format version
- Streamer.EmitIntValue(0x41, 1);
+ Streamer.emitInt8(0x41);
}
// Vendor size + Vendor name + '\0'
@@ -1102,31 +1095,31 @@ void ARMTargetELFStreamer::finishAttributeSection() {
const size_t ContentsSize = calculateContentSize();
- Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
- Streamer.EmitBytes(CurrentVendor);
- Streamer.EmitIntValue(0, 1); // '\0'
+ Streamer.emitInt32(VendorHeaderSize + TagHeaderSize + ContentsSize);
+ Streamer.emitBytes(CurrentVendor);
+ Streamer.emitInt8(0); // '\0'
- Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
- Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+ Streamer.emitInt8(ARMBuildAttrs::File);
+ Streamer.emitInt32(TagHeaderSize + ContentsSize);
// Size should have been accounted for already, now
// emit each field as its type (ULEB or String)
for (size_t i = 0; i < Contents.size(); ++i) {
AttributeItem item = Contents[i];
- Streamer.EmitULEB128IntValue(item.Tag);
+ Streamer.emitULEB128IntValue(item.Tag);
switch (item.Type) {
default: llvm_unreachable("Invalid attribute type");
case AttributeItem::NumericAttribute:
- Streamer.EmitULEB128IntValue(item.IntValue);
+ Streamer.emitULEB128IntValue(item.IntValue);
break;
case AttributeItem::TextAttribute:
- Streamer.EmitBytes(item.StringValue);
- Streamer.EmitIntValue(0, 1); // '\0'
+ Streamer.emitBytes(item.StringValue);
+ Streamer.emitInt8(0); // '\0'
break;
case AttributeItem::NumericAndTextAttributes:
- Streamer.EmitULEB128IntValue(item.IntValue);
- Streamer.EmitBytes(item.StringValue);
- Streamer.EmitIntValue(0, 1); // '\0'
+ Streamer.emitULEB128IntValue(item.IntValue);
+ Streamer.emitBytes(item.StringValue);
+ Streamer.emitInt8(0); // '\0'
break;
}
}
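The hunk above has finishAttributeSection build the .ARM.attributes blob by hand: a 0x41 format byte, a vendor subsection (4-byte length, vendor name, NUL terminator), a File tag byte with its own 4-byte length, and then the ULEB128/string-encoded attribute entries. The following stand-alone C++ sketch lays out the same bytes outside the MC layer; the header-size values and the single numeric attribute (tag 6, value 14) are illustrative assumptions, not taken from the hunk.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Append a ULEB128-encoded value (enough for the small tags used here).
static void appendULEB(std::vector<std::uint8_t> &Out, std::uint64_t V) {
  do {
    std::uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (V);
}

// Append a 32-bit length field; little-endian is assumed for this sketch.
static void appendU32(std::vector<std::uint8_t> &Out, std::uint32_t V) {
  for (int I = 0; I < 4; ++I)
    Out.push_back(static_cast<std::uint8_t>((V >> (8 * I)) & 0xff));
}

int main() {
  const std::string Vendor = "aeabi";

  // One hypothetical numeric attribute: ULEB tag followed by ULEB value.
  std::vector<std::uint8_t> Contents;
  appendULEB(Contents, 6);   // e.g. a CPU-arch style tag
  appendULEB(Contents, 14);  // illustrative value

  const std::uint32_t TagHeaderSize = 1 + 4; // File tag byte + its 4-byte size
  const std::uint32_t VendorHeaderSize =
      static_cast<std::uint32_t>(4 + Vendor.size() + 1); // size + name + '\0'

  std::vector<std::uint8_t> Blob;
  Blob.push_back(0x41);                                  // format version 'A'
  appendU32(Blob, VendorHeaderSize + TagHeaderSize +
                      static_cast<std::uint32_t>(Contents.size()));
  Blob.insert(Blob.end(), Vendor.begin(), Vendor.end());
  Blob.push_back(0);                                     // vendor NUL
  Blob.push_back(1);                                     // File tag
  appendU32(Blob, TagHeaderSize + static_cast<std::uint32_t>(Contents.size()));
  Blob.insert(Blob.end(), Contents.begin(), Contents.end());

  std::printf("%zu bytes of attribute payload\n", Blob.size());
  return 0;
}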
@@ -1143,7 +1136,7 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
Streamer.getAssembler().registerSymbol(*Symbol);
unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)
- Streamer.EmitThumbFunc(Symbol);
+ Streamer.emitThumbFunc(Symbol);
}
void
@@ -1155,13 +1148,13 @@ void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
const MCSymbol &Sym = SRE->getSymbol();
if (!Sym.isDefined()) {
- getStreamer().EmitAssignment(Symbol, Value);
+ getStreamer().emitAssignment(Symbol, Value);
return;
}
}
- getStreamer().EmitThumbFunc(Symbol);
- getStreamer().EmitAssignment(Symbol, Value);
+ getStreamer().emitThumbFunc(Symbol);
+ getStreamer().emitAssignment(Symbol, Value);
}
void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
@@ -1170,12 +1163,12 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; }
-void ARMELFStreamer::FinishImpl() {
+void ARMELFStreamer::finishImpl() {
MCTargetStreamer &TS = *getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
ATS.finishAttributeSection();
- MCELFStreamer::FinishImpl();
+ MCELFStreamer::finishImpl();
}
void ARMELFStreamer::reset() {
@@ -1201,7 +1194,7 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
static_cast<const MCSectionELF &>(Fn.getSection());
// Create the name for new section
- StringRef FnSecName(FnSection.getSectionName());
+ StringRef FnSecName(FnSection.getName());
SmallString<128> EHSecName(Prefix);
if (FnSecName != ".text") {
EHSecName += FnSecName;
@@ -1213,13 +1206,13 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
Flags |= ELF::SHF_GROUP;
MCSectionELF *EHSection = getContext().getELFSection(
EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(),
- static_cast<const MCSymbolELF *>(&Fn));
+ static_cast<const MCSymbolELF *>(FnSection.getBeginSymbol()));
assert(EHSection && "Failed to get the required EH section");
// Switch to .ARM.extab or .ARM.exidx section
SwitchSection(EHSection);
- EmitCodeAlignment(4);
+ emitCodeAlignment(4);
}
inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
@@ -1258,7 +1251,7 @@ void ARMELFStreamer::EHReset() {
void ARMELFStreamer::emitFnStart() {
assert(FnStart == nullptr);
FnStart = getContext().createTempSymbol();
- EmitLabel(FnStart);
+ emitLabel(FnStart);
}
void ARMELFStreamer::emitFnEnd() {
@@ -1284,17 +1277,17 @@ void ARMELFStreamer::emitFnEnd() {
MCSymbolRefExpr::VK_ARM_PREL31,
getContext());
- EmitValue(FnStartRef, 4);
+ emitValue(FnStartRef, 4);
if (CantUnwind) {
- EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4);
+ emitInt32(ARM::EHABI::EXIDX_CANTUNWIND);
} else if (ExTab) {
// Emit a reference to the unwind opcodes in the ".ARM.extab" section.
const MCSymbolRefExpr *ExTabEntryRef =
MCSymbolRefExpr::create(ExTab,
MCSymbolRefExpr::VK_ARM_PREL31,
getContext());
- EmitValue(ExTabEntryRef, 4);
+ emitValue(ExTabEntryRef, 4);
} else {
// For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
// the second word of exception index table entry. The size of the unwind
@@ -1307,7 +1300,7 @@ void ARMELFStreamer::emitFnEnd() {
Opcodes[1] << 8 |
Opcodes[2] << 16 |
Opcodes[3] << 24;
- EmitIntValue(Intval, Opcodes.size());
+ emitIntValue(Intval, Opcodes.size());
}
// Switch to the section containing FnStart
@@ -1366,7 +1359,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
// Create .ARM.extab label for offset in .ARM.exidx
assert(!ExTab);
ExTab = getContext().createTempSymbol();
- EmitLabel(ExTab);
+ emitLabel(ExTab);
// Emit personality
if (Personality) {
@@ -1375,7 +1368,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
MCSymbolRefExpr::VK_ARM_PREL31,
getContext());
- EmitValue(PersonalityRef, 4);
+ emitValue(PersonalityRef, 4);
}
// Emit unwind opcodes
@@ -1386,7 +1379,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
Opcodes[I + 1] << 8 |
Opcodes[I + 2] << 16 |
Opcodes[I + 3] << 24;
- EmitIntValue(Intval, 4);
+ emitInt32(Intval);
}
// According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
@@ -1397,7 +1390,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
// In case that the .handlerdata directive is not specified by the
// programmer, we should emit zero to terminate the handler data.
if (NoHandlerData && !Personality)
- EmitIntValue(0, 4);
+ emitInt32(0);
}
void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
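The unwind emission above keeps packing four EHABI opcode bytes into one word and only switches the emission call from EmitIntValue to emitInt32/emitIntValue with the new casing. A minimal sketch of that little-endian packing, using hypothetical opcode values rather than the streamer API:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Pack four unwind opcode bytes into one word, least significant byte first,
// mirroring the Opcodes[0] | Opcodes[1] << 8 | ... expression above.
static std::uint32_t packUnwindWord(const std::uint8_t *Op) {
  return static_cast<std::uint32_t>(Op[0]) |
         (static_cast<std::uint32_t>(Op[1]) << 8) |
         (static_cast<std::uint32_t>(Op[2]) << 16) |
         (static_cast<std::uint32_t>(Op[3]) << 24);
}

int main() {
  // Hypothetical, already padded opcode stream (0xb0 is the EHABI FINISH op).
  std::vector<std::uint8_t> Opcodes = {0xa8, 0xb0, 0xb0, 0xb0};
  for (std::size_t I = 0; I + 3 < Opcodes.size(); I += 4)
    std::printf("0x%08x\n", static_cast<unsigned>(packUnwindWord(&Opcodes[I])));
  return 0;
}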
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index b36106a78b71..744d919f2fd4 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -288,7 +288,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
case ARM::t2DSB:
switch (MI->getOperand(0).getImm()) {
default:
- if (!printAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
break;
case 0:
@@ -302,7 +302,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
return;
}
- if (!printAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
@@ -1669,15 +1669,6 @@ void ARMInstPrinter::printVPTMask(const MCInst *MI, unsigned OpNum,
}
}
-void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint32_t Val = MI->getOperand(OpNum).getImm();
- O << markup("<imm:") << "#0x";
- O.write_hex(Val);
- O << markup(">");
-}
-
void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index 20f901033395..37cb731ff001 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -32,10 +32,10 @@ public:
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
- virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
- virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ virtual bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
static const char *getRegisterName(unsigned RegNo,
@@ -43,6 +43,10 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printOperand(MI, OpNum, STI, O);
+ }
void printSORegRegOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -109,6 +113,12 @@ public:
template <unsigned scale>
void printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template <unsigned scale>
+ void printAdrLabelOperand(const MCInst *MI, uint64_t /*Address*/,
+ unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printAdrLabelOperand<scale>(MI, OpNum, STI, O);
+ }
void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printThumbSRImm(const MCInst *MI, unsigned OpNum,
@@ -206,6 +216,11 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbLdrLabelOperand(const MCInst *MI, uint64_t /*Address*/,
+ unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printThumbLdrLabelOperand(MI, OpNum, STI, O);
+ }
void printFBits16(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printFBits32(const MCInst *MI, unsigned OpNum,
@@ -260,8 +275,6 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printMveSaturateOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
private:
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index d30d15df3d00..765613cf347d 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -37,8 +37,6 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
? ExceptionHandling::SjLj
: ExceptionHandling::DwarfCFI;
-
- UseIntegratedAssembler = true;
}
void ARMELFMCAsmInfo::anchor() { }
@@ -73,8 +71,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) {
// foo(plt) instead of foo@plt
UseParensForSymbolVariant = true;
-
- UseIntegratedAssembler = true;
}
void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) {
@@ -116,7 +112,6 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
ExceptionsType = ExceptionHandling::DwarfCFI;
UseParensForSymbolVariant = true;
- UseIntegratedAssembler = true;
DwarfRegNumForCFI = false;
// Conditional Thumb 4-byte instructions can have an implicit IT.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 268fe7efd9ce..1cb99534f146 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -413,14 +413,6 @@ public:
unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- template <uint8_t shift, bool invert>
- unsigned getExpandedImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- static_assert(shift <= 32, "Shift count must be less than or equal to 32.");
- const MCOperand MO = MI.getOperand(Op);
- return (invert ? (MO.getImm() ^ 0xff) : MO.getImm()) >> shift;
- }
unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
unsigned EncodedValue,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 9f60e70e0e02..05d73ccf6ff2 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -63,6 +63,25 @@ static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
return true;
}
}
+ if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] &&
+ ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) ||
+ (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) {
+ Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating "
+ "point instructions";
+ return true;
+ }
+ return false;
+}
+
+static bool getMRCDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+ std::string &Info) {
+ if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] &&
+ ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) ||
+ (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) {
+ Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating "
+ "point instructions";
+ return true;
+ }
return false;
}
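The new getMRCDeprecationInfo mirrors the check added to getMCRDeprecationInfo: on v7 and later cores, coprocessors 10 and 11 are reserved for floating-point and Advanced SIMD, so MCR/MRC accesses to them are reported as deprecated. A reduced sketch of just that predicate, detached from MCInst and the subtarget feature bits:

// Hedged sketch, plain C++ rather than the MC layer.
static constexpr bool isDeprecatedVFPCoproc(bool HasV7Ops, long Coproc) {
  return HasV7Ops && (Coproc == 10 || Coproc == 11);
}

static_assert(isDeprecatedVFPCoproc(true, 10), "cp10 is deprecated on v7");
static_assert(!isDeprecatedVFPCoproc(false, 10), "pre-v7 cores are unaffected");
static_assert(!isDeprecatedVFPCoproc(true, 15), "cp15 accesses are still fine");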
@@ -168,7 +187,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
if (!ArchFS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
else
- ArchFS = FS;
+ ArchFS = std::string(FS);
}
return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS);
@@ -200,7 +219,7 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
MAI = new ARMELFMCAsmInfo(TheTriple);
unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
- MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
+ MAI->addInitialFrameState(MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0));
return MAI;
}
@@ -266,7 +285,9 @@ public:
bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const override {
// We only handle PCRel branches for now.
- if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+ if (Inst.getNumOperands() == 0 ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType !=
+ MCOI::OPERAND_PCREL)
return false;
int64_t Imm = Inst.getOperand(0).getImm();
@@ -285,8 +306,15 @@ public:
switch (Inst.getOpcode()) {
default:
OpId = 0;
+ if (Inst.getNumOperands() == 0)
+ return false;
break;
+ case ARM::MVE_WLSTP_8:
+ case ARM::MVE_WLSTP_16:
+ case ARM::MVE_WLSTP_32:
+ case ARM::MVE_WLSTP_64:
case ARM::t2WLS:
+ case ARM::MVE_LETP:
case ARM::t2LEUpdate:
OpId = 2;
break;
@@ -316,6 +344,14 @@ static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
return new ThumbMCInstrAnalysis(Info);
}
+bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) {
+ // Unfortunately we don't have ARMTargetInfo in the disassembler, so we have
+ // to rely on feature bits.
+ if (Coproc >= 8)
+ return false;
+ return STI.getFeatureBits()[ARM::FeatureCoprocCDE0 + Coproc];
+}
+
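ARM::isCDECoproc is new here: CDE only defines coprocessors 0 through 7, and each one is gated by its own FeatureCoprocCDE feature bit. A feature-bitset-free sketch of the same decision, with the per-coprocessor bits modelled as a plain mask (the mask values below are made up):

#include <cstddef>
#include <cstdint>

// One bit per CDE coprocessor: bit N set means cpN is a CDE coprocessor.
static constexpr bool isCDECoprocSketch(std::size_t Coproc,
                                        std::uint8_t CDEMask) {
  if (Coproc >= 8)
    return false;                   // CDE only covers cp0..cp7
  return (CDEMask >> Coproc) & 1u;  // consult the per-coprocessor bit
}

static_assert(isCDECoprocSketch(3, 0x08), "cp3 is enabled in this mask");
static_assert(!isCDECoprocSketch(9, 0xff), "cp9 can never be CDE");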
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 9cbbd56225ef..7cfe6881b456 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -107,6 +107,9 @@ inline bool isVpred(OperandType op) {
inline bool isVpred(uint8_t op) {
return isVpred(static_cast<OperandType>(op));
}
+
+bool isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI);
+
} // end namespace ARM
} // End llvm namespace
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index 7b30a61e8ccb..1fee354cad93 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -80,7 +80,7 @@ void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {
default:
llvm_unreachable("Invalid Suffix");
}
- getStreamer().EmitBytes(StringRef(Buffer, Size));
+ getStreamer().emitBytes(StringRef(Buffer, Size));
}
// The remaining callbacks should be handled separately by each
@@ -108,7 +108,7 @@ void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {}
void ARMTargetStreamer::emitArch(ARM::ArchKind Arch) {}
-void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {}
+void ARMTargetStreamer::emitArchExtension(uint64_t ArchExt) {}
void ARMTargetStreamer::emitObjectArch(ARM::ArchKind Arch) {}
void ARMTargetStreamer::emitFPU(unsigned FPU) {}
void ARMTargetStreamer::finishAttributeSection() {}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index a9460b70da56..781627c3c425 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -134,7 +134,7 @@ void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
uint8_t Buff[16];
Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
- EmitBytes(Buff, ULEBSize + 1);
+ emitBytes(Buff, ULEBSize + 1);
} else if (Offset > 0) {
if (Offset > 0x100) {
EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index 5fb7307159d1..ec11a78f8a7a 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -64,7 +64,7 @@ public:
OpBegins.push_back(OpBegins.back() + Opcodes.size());
}
- /// Finalize the unwind opcode sequence for EmitBytes()
+ /// Finalize the unwind opcode sequence for emitBytes()
void Finalize(unsigned &PersonalityIndex,
SmallVectorImpl<uint8_t> &Result);
@@ -80,7 +80,7 @@ private:
OpBegins.push_back(OpBegins.back() + 2);
}
- void EmitBytes(const uint8_t *Opcode, size_t Size) {
+ void emitBytes(const uint8_t *Opcode, size_t Size) {
Ops.insert(Ops.end(), Opcode, Opcode + Size);
OpBegins.push_back(OpBegins.back() + Size);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index b3c8146a9bde..e6f649164a29 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -22,18 +22,18 @@ public:
std::unique_ptr<MCObjectWriter> OW)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
- void EmitThumbFunc(MCSymbol *Symbol) override;
- void FinishImpl() override;
+ void emitThumbFunc(MCSymbol *Symbol) override;
+ void finishImpl() override;
};
-void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+void ARMWinCOFFStreamer::emitThumbFunc(MCSymbol *Symbol) {
getAssembler().setIsThumbFunc(Symbol);
}
-void ARMWinCOFFStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void ARMWinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
- MCWinCOFFStreamer::FinishImpl();
+ MCWinCOFFStreamer::finishImpl();
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 9f64af02e698..4d7ad6cd60cb 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -15,6 +15,7 @@
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -37,6 +38,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -67,27 +69,77 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
private:
+ LoopInfo *LI = nullptr;
+
// Check this is a valid gather with correct alignment
bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
- unsigned Alignment);
+ Align Alignment);
// Check whether Ptr is hidden behind a bitcast and look through it
void lookThroughBitcast(Value *&Ptr);
// Check for a getelementptr and deduce base and offsets from it, on success
// returning the base directly and the offsets indirectly using the Offsets
// argument
- Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
+ Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+ IRBuilder<> &Builder);
+ // Compute the scale of this gather/scatter instruction
+ int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
+ // If the value is a constant, or derived from constants via additions
+ // and multiplications, return its numeric value
+ Optional<int64_t> getIfConst(const Value *V);
+ // If Inst is an add instruction, check whether one summand is a
+ // constant. If so, scale this constant and return it together with
+ // the other summand.
+ std::pair<Value *, int64_t> getVarAndConst(Value *Inst, int TypeScale);
- bool lowerGather(IntrinsicInst *I);
+ Value *lowerGather(IntrinsicInst *I);
// Create a gather from a base + vector of offsets
Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> Builder);
+ Instruction *&Root, IRBuilder<> &Builder);
// Create a gather from a vector of pointers
Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> Builder);
+ IRBuilder<> &Builder, int64_t Increment = 0);
+ // Create an incrementing gather from a vector of pointers
+ Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
+
+ Value *lowerScatter(IntrinsicInst *I);
+ // Create a scatter to a base + vector of offsets
+ Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets,
+ IRBuilder<> &Builder);
+ // Create a scatter to a vector of pointers
+ Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
+ // Create an incrementing scatter from a vector of pointers
+ Value *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
+
+ // QI gathers and scatters can increment their offsets on their own if
+ // the increment is a constant value (immediate)
+ Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
+ Value *Ptr, GetElementPtrInst *GEP,
+ IRBuilder<> &Builder);
+ // QI gathers/scatters can increment their offsets on their own if the
+ // increment is a constant value (immediate) - this creates a writeback QI
+ // gather/scatter
+ Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
+ Value *Ptr, unsigned TypeScale,
+ IRBuilder<> &Builder);
+ // Check whether these offsets could be moved out of the loop they're in
+ bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
+ // Pushes the given add out of the loop
+ void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex);
+ // Pushes the given mul out of the loop
+ void pushOutMul(PHINode *&Phi, Value *IncrementPerRound,
+ Value *OffsSecondOperand, unsigned LoopIncrement,
+ IRBuilder<> &Builder);
};
} // end anonymous namespace
@@ -103,102 +155,177 @@ Pass *llvm::createMVEGatherScatterLoweringPass() {
bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
unsigned ElemSize,
- unsigned Alignment) {
- // Do only allow non-extending gathers for now
- if (((NumElements == 4 && ElemSize == 32) ||
- (NumElements == 8 && ElemSize == 16) ||
+ Align Alignment) {
+ if (((NumElements == 4 &&
+ (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) ||
+ (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) ||
(NumElements == 16 && ElemSize == 8)) &&
- ElemSize / 8 <= Alignment)
+ Alignment >= ElemSize / 8)
return true;
- LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
- << "alignment or vector type \n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have "
+ << "valid alignment or vector type \n");
return false;
}
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
- IRBuilder<> Builder) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+ GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
if (!GEP) {
- LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: no getelementpointer found\n");
return nullptr;
}
- LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
- << " from base + vector of offsets\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
+ << " Looking at intrinsic for base + vector of offsets\n");
Value *GEPPtr = GEP->getPointerOperand();
if (GEPPtr->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
- << " hidden behind a getelementptr currently not"
- << " supported. Expanding.\n");
return nullptr;
}
if (GEP->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
<< " operands. Expanding.\n");
return nullptr;
}
Offsets = GEP->getOperand(1);
- // SExt offsets inside masked gathers are not permitted by the architecture;
- // we therefore can't fold them
+ // Paranoid check whether the number of parallel lanes is the same
+ assert(cast<FixedVectorType>(Ty)->getNumElements() ==
+ cast<FixedVectorType>(Offsets->getType())->getNumElements());
+ // Only <N x i32> offsets can be integrated into an arm gather, any smaller
+ // type would have to be sign extended by the gep - and arm gathers can only
+ // zero extend. Additionally, the offsets have to originate from a zext of
+ // a vector with element types smaller than or equal to the type of the
+ // gather we're looking at
+ if (Offsets->getType()->getScalarSizeInBits() != 32)
+ return nullptr;
if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
Offsets = ZextOffs->getOperand(0);
- Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
- // If the offset we found does not have the type the intrinsic expects,
- // i.e., the same type as the gather itself, we need to convert it (only i
- // types) or fall back to expanding the gather
- if (OffsType != Offsets->getType()) {
- if (OffsType->getScalarSizeInBits() >
- Offsets->getType()->getScalarSizeInBits()) {
- LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
- Offsets = Builder.CreateZExt(Offsets, OffsType, "");
- } else {
- LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
- << " create masked gather\n");
+ else if (!(cast<FixedVectorType>(Offsets->getType())->getNumElements() == 4 &&
+ Offsets->getType()->getScalarSizeInBits() == 32))
+ return nullptr;
+
+ if (Ty != Offsets->getType()) {
+ if ((Ty->getScalarSizeInBits() <
+ Offsets->getType()->getScalarSizeInBits())) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
+ << " Can't create intrinsic.\n");
return nullptr;
+ } else {
+ Offsets = Builder.CreateZExt(
+ Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
}
}
// If none of the checks failed, return the gep's base pointer
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n");
return GEPPtr;
}
void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) {
// Look through bitcast instruction if #elements is the same
if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) {
- Type *BCTy = BitCast->getType();
- Type *BCSrcTy = BitCast->getOperand(0)->getType();
- if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
- LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
+ auto *BCTy = cast<FixedVectorType>(BitCast->getType());
+ auto *BCSrcTy = cast<FixedVectorType>(BitCast->getOperand(0)->getType());
+ if (BCTy->getNumElements() == BCSrcTy->getNumElements()) {
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: looking through bitcast\n");
Ptr = BitCast->getOperand(0);
}
}
}
-bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
+int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
+ unsigned MemoryElemSize) {
+ // This can be a 32-bit load/store scaled by 4, a 16-bit load/store scaled
+ // by 2, or an 8-bit, 16-bit or 32-bit load/store scaled by 1
+ if (GEPElemSize == 32 && MemoryElemSize == 32)
+ return 2;
+ else if (GEPElemSize == 16 && MemoryElemSize == 16)
+ return 1;
+ else if (GEPElemSize == 8)
+ return 0;
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't "
+ << "create intrinsic\n");
+ return -1;
+}
+
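computeScale maps the GEP element size and the in-memory element size onto the scale immediate of the MVE gather/scatter intrinsics (the offsets are multiplied by 1 << scale bytes). A compile-time restatement of that mapping outside LLVM, just to make the three legal cases and the -1 failure path explicit:

static constexpr int computeScaleSketch(unsigned GEPElemSize,
                                        unsigned MemoryElemSize) {
  if (GEPElemSize == 32 && MemoryElemSize == 32)
    return 2;  // 32-bit elements, offsets scaled by 4
  if (GEPElemSize == 16 && MemoryElemSize == 16)
    return 1;  // 16-bit elements, offsets scaled by 2
  if (GEPElemSize == 8)
    return 0;  // byte offsets, no scaling
  return -1;   // no legal scale; the caller falls back to expansion
}

static_assert(computeScaleSketch(32, 32) == 2, "word access");
static_assert(computeScaleSketch(8, 16) == 0, "extending 8-bit access");
static_assert(computeScaleSketch(16, 32) == -1, "rejected combination");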
+Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) {
+ const Constant *C = dyn_cast<Constant>(V);
+ if (C != nullptr)
+ return Optional<int64_t>{C->getUniqueInteger().getSExtValue()};
+ if (!isa<Instruction>(V))
+ return Optional<int64_t>{};
+
+ const Instruction *I = cast<Instruction>(V);
+ if (I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::Mul) {
+ Optional<int64_t> Op0 = getIfConst(I->getOperand(0));
+ Optional<int64_t> Op1 = getIfConst(I->getOperand(1));
+ if (!Op0 || !Op1)
+ return Optional<int64_t>{};
+ if (I->getOpcode() == Instruction::Add)
+ return Optional<int64_t>{Op0.getValue() + Op1.getValue()};
+ if (I->getOpcode() == Instruction::Mul)
+ return Optional<int64_t>{Op0.getValue() * Op1.getValue()};
+ }
+ return Optional<int64_t>{};
+}
+
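getIfConst folds a Value down to a number when it is built purely from constants combined with adds and muls, and returns an empty Optional otherwise. The same recursion over a toy expression tree, using std::optional instead of llvm::Optional and no llvm::Value, as a sketch:

#include <cstdint>
#include <memory>
#include <optional>

struct Expr {
  enum Kind { Const, Add, Mul, Opaque } K = Opaque;
  std::int64_t Val = 0;            // used when K == Const
  std::shared_ptr<Expr> LHS, RHS;  // used when K == Add or Mul
};

static std::optional<std::int64_t> getIfConstSketch(const Expr &E) {
  if (E.K == Expr::Const)
    return E.Val;
  if (E.K != Expr::Add && E.K != Expr::Mul)
    return std::nullopt;  // opaque value: give up
  auto L = getIfConstSketch(*E.LHS);
  auto R = getIfConstSketch(*E.RHS);
  if (!L || !R)
    return std::nullopt;
  return E.K == Expr::Add ? *L + *R : *L * *R;
}

int main() {
  auto MakeConst = [](std::int64_t V) {
    auto E = std::make_shared<Expr>();
    E->K = Expr::Const;
    E->Val = V;
    return E;
  };
  Expr Sum;  // models "3 + 4"
  Sum.K = Expr::Add;
  Sum.LHS = MakeConst(3);
  Sum.RHS = MakeConst(4);
  return getIfConstSketch(Sum).value_or(-1) == 7 ? 0 : 1;
}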
+std::pair<Value *, int64_t>
+MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) {
+ std::pair<Value *, int64_t> ReturnFalse =
+ std::pair<Value *, int64_t>(nullptr, 0);
+ // At this point, the instruction we're looking at must be an add or we
+ // bail out
+ Instruction *Add = dyn_cast<Instruction>(Inst);
+ if (Add == nullptr || Add->getOpcode() != Instruction::Add)
+ return ReturnFalse;
+
+ Value *Summand;
+ Optional<int64_t> Const;
+ // Find out which operand holds the value that is being incremented
+ if ((Const = getIfConst(Add->getOperand(0))))
+ Summand = Add->getOperand(1);
+ else if ((Const = getIfConst(Add->getOperand(1))))
+ Summand = Add->getOperand(0);
+ else
+ return ReturnFalse;
+
+ // Check that the constant is small enough for an incrementing gather
+ int64_t Immediate = Const.getValue() << TypeScale;
+ if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0)
+ return ReturnFalse;
+
+ return std::pair<Value *, int64_t>(Summand, Immediate);
+}
+
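getVarAndConst only accepts an add of a loop-carried value and a constant whose scaled form fits the QI gather's increment field: after shifting left by TypeScale, the increment must be a multiple of 4 whose magnitude does not exceed 512. A small worked check of just that range test (the example values are hypothetical):

#include <cstdint>
#include <cstdio>

// Returns true when (Const << TypeScale) is a legal QI-gather increment.
static bool fitsIncrementImmediate(std::int64_t Const, int TypeScale) {
  std::int64_t Immediate = Const << TypeScale;
  return Immediate <= 512 && Immediate >= -512 && Immediate % 4 == 0;
}

int main() {
  // For a v4i32 gather (TypeScale 2): stepping by 4 elements is 4 << 2 == 16
  // bytes, which fits; stepping by 129 elements is 516 bytes, which does not.
  std::printf("%d\n", fitsIncrementImmediate(4, 2));    // prints 1
  std::printf("%d\n", fitsIncrementImmediate(129, 2));  // prints 0
  return 0;
}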
+Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
using namespace PatternMatch;
LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
// @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
// Attempt to turn the masked gather in I into a MVE intrinsic
// Potentially optimising the addressing modes as we do so.
- Type *Ty = I->getType();
+ auto *Ty = cast<FixedVectorType>(I->getType());
Value *Ptr = I->getArgOperand(0);
- unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue();
+ Align Alignment = cast<ConstantInt>(I->getArgOperand(1))->getAlignValue();
Value *Mask = I->getArgOperand(2);
Value *PassThru = I->getArgOperand(3);
- if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
- Ty->getScalarSizeInBits(), Alignment))
- return false;
+ if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
+ Alignment))
+ return nullptr;
lookThroughBitcast(Ptr);
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
IRBuilder<> Builder(I->getContext());
Builder.SetInsertPoint(I);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
+
+ Instruction *Root = I;
+ Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
if (!Load)
- return false;
+ return nullptr;
if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
@@ -206,72 +333,649 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Load = Builder.CreateSelect(Mask, Load, PassThru);
}
+ Root->replaceAllUsesWith(Load);
+ Root->eraseFromParent();
+ if (Root != I)
+ // If this was an extending gather, we need to get rid of the sext/zext
+ // sext/zext as well as of the gather itself
+ I->eraseFromParent();
+
LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
- I->replaceAllUsesWith(Load);
- I->eraseFromParent();
- return true;
+ return Load;
}
-Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
- IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
+ Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment) {
using namespace PatternMatch;
+ auto *Ty = cast<FixedVectorType>(I->getType());
LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
- Type *Ty = I->getType();
- if (Ty->getVectorNumElements() != 4)
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
// Can't build an intrinsic for this
return nullptr;
Value *Mask = I->getArgOperand(2);
if (match(Mask, m_One()))
return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
{Ty, Ptr->getType()},
- {Ptr, Builder.getInt32(0)});
+ {Ptr, Builder.getInt32(Increment)});
else
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_base_predicated,
{Ty, Ptr->getType(), Mask->getType()},
- {Ptr, Builder.getInt32(0), Mask});
+ {Ptr, Builder.getInt32(Increment), Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+ using namespace PatternMatch;
+ auto *Ty = cast<FixedVectorType>(I->getType());
+ LLVM_DEBUG(
+ dbgs()
+ << "masked gathers: loading from vector of pointers with writeback\n");
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+ // Can't build an intrinsic for this
+ return nullptr;
+ Value *Mask = I->getArgOperand(2);
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb,
+ {Ty, Ptr->getType()},
+ {Ptr, Builder.getInt32(Increment)});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vldr_gather_base_wb_predicated,
+ {Ty, Ptr->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(Increment), Mask});
}
Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
- IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) {
using namespace PatternMatch;
- Type *Ty = I->getType();
+
+ Type *OriginalTy = I->getType();
+ Type *ResultTy = OriginalTy;
+
+ unsigned Unsigned = 1;
+ // The size of the gather was already checked in isLegalTypeAndAlignment;
+ // if it was not a full vector width an appropriate extend should follow.
+ auto *Extend = Root;
+ if (OriginalTy->getPrimitiveSizeInBits() < 128) {
+ // Only transform gathers with exactly one use
+ if (!I->hasOneUse())
+ return nullptr;
+
+ // The correct root to replace is not the CallInst itself, but the
+ // instruction which extends it
+ Extend = cast<Instruction>(*I->users().begin());
+ if (isa<SExtInst>(Extend)) {
+ Unsigned = 0;
+ } else if (!isa<ZExtInst>(Extend)) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
+ << "Expanding\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
+ ResultTy = Extend->getType();
+ // The final size of the gather must be a full vector width
+ if (ResultTy->getPrimitiveSizeInBits() != 128) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
+ << "Expanding\n");
+ return nullptr;
+ }
+ }
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
- Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder);
+ Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder);
if (!BasePtr)
return nullptr;
+ // Check whether the offset is a constant increment that could be merged into
+ // a QI gather
+ Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
+ if (Load)
+ return Load;
- unsigned Scale;
- int GEPElemSize =
- BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
- int ResultElemSize = Ty->getScalarSizeInBits();
- // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
- // 8bit, 16bit or 32bit load scaled by 1
- if (GEPElemSize == 32 && ResultElemSize == 32) {
- Scale = 2;
- } else if (GEPElemSize == 16 && ResultElemSize == 16) {
- Scale = 1;
- } else if (GEPElemSize == 8) {
- Scale = 0;
- } else {
- LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
- << " create masked gather\n");
+ int Scale = computeScale(
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ OriginalTy->getScalarSizeInBits());
+ if (Scale == -1)
return nullptr;
- }
+ Root = Extend;
Value *Mask = I->getArgOperand(2);
if (!match(Mask, m_One()))
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset_predicated,
- {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
- {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
- Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+ {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+ {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
else
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset,
- {Ty, BasePtr->getType(), Offsets->getType()},
- {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
- Builder.getInt32(Scale), Builder.getInt32(1)});
+ {ResultTy, BasePtr->getType(), Offsets->getType()},
+ {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
+}
+
+Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+ using namespace PatternMatch;
+ LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+ // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+ // Attempt to turn the masked scatter in I into a MVE intrinsic
+ // Potentially optimising the addressing modes as we do so.
+ Value *Input = I->getArgOperand(0);
+ Value *Ptr = I->getArgOperand(1);
+ Align Alignment = cast<ConstantInt>(I->getArgOperand(2))->getAlignValue();
+ auto *Ty = cast<FixedVectorType>(Input->getType());
+
+ if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
+ Alignment))
+ return nullptr;
+
+ lookThroughBitcast(Ptr);
+ assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+ IRBuilder<> Builder(I->getContext());
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+ Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+ if (!Store)
+ Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+ if (!Store)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+ I->eraseFromParent();
+ return Store;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ auto *Ty = cast<FixedVectorType>(Input->getType());
+ // Only QR variants allow truncating
+ if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+ // Can't build an intrinsic for this
+ return nullptr;
+ }
+ Value *Mask = I->getArgOperand(3);
+ // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+ LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+ {Ptr->getType(), Input->getType()},
+ {Ptr, Builder.getInt32(Increment), Input});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_base_predicated,
+ {Ptr->getType(), Input->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ auto *Ty = cast<FixedVectorType>(Input->getType());
+ LLVM_DEBUG(
+ dbgs()
+ << "masked scatters: storing to a vector of pointers with writeback\n");
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+ // Can't build an intrinsic for this
+ return nullptr;
+ Value *Mask = I->getArgOperand(3);
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb,
+ {Ptr->getType(), Input->getType()},
+ {Ptr, Builder.getInt32(Increment), Input});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_base_wb_predicated,
+ {Ptr->getType(), Input->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ Value *Mask = I->getArgOperand(3);
+ Type *InputTy = Input->getType();
+ Type *MemoryTy = InputTy;
+ LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+ << " to base + vector of offsets\n");
+ // If the input has been truncated, try to integrate that trunc into the
+ // scatter instruction (we don't care about alignment here)
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+ Value *PreTrunc = Trunc->getOperand(0);
+ Type *PreTruncTy = PreTrunc->getType();
+ if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+ Input = PreTrunc;
+ InputTy = PreTruncTy;
+ }
+ }
+ if (InputTy->getPrimitiveSizeInBits() != 128) {
+ LLVM_DEBUG(
+ dbgs() << "masked scatters: cannot create scatters for non-standard"
+ << " input types. Expanding.\n");
+ return nullptr;
+ }
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ Value *Offsets;
+ Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
+ if (!BasePtr)
+ return nullptr;
+ // Check whether the offset is a constant increment that could be merged into
+ // a QI gather/scatter
+ Value *Store =
+ tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
+ if (Store)
+ return Store;
+ int Scale = computeScale(
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ MemoryTy->getScalarSizeInBits());
+ if (Scale == -1)
+ return nullptr;
+
+ if (!match(Mask, m_One()))
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+ {BasePtr->getType(), Offsets->getType(), Input->getType(),
+ Mask->getType()},
+ {BasePtr, Offsets, Input,
+ Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Mask});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_offset,
+ {BasePtr->getType(), Offsets->getType(), Input->getType()},
+ {BasePtr, Offsets, Input,
+ Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale)});
+}
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
+ IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
+ FixedVectorType *Ty;
+ if (I->getIntrinsicID() == Intrinsic::masked_gather)
+ Ty = cast<FixedVectorType>(I->getType());
+ else
+ Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+ // Incrementing gathers only exist for v4i32
+ if (Ty->getNumElements() != 4 ||
+ Ty->getScalarSizeInBits() != 32)
+ return nullptr;
+ Loop *L = LI->getLoopFor(I->getParent());
+ if (L == nullptr)
+ // Incrementing gathers are not beneficial outside of a loop
+ return nullptr;
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+ "wb gather/scatter\n");
+
+ // The gep was in charge of making sure the offsets are scaled correctly
+ // - calculate that factor so it can be applied by hand
+ DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout();
+ int TypeScale =
+ computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()),
+ DT.getTypeSizeInBits(GEP->getType()) /
+ cast<FixedVectorType>(GEP->getType())->getNumElements());
+ if (TypeScale == -1)
+ return nullptr;
+
+ if (GEP->hasOneUse()) {
+ // Only in this case do we want to build a wb gather, because the wb will
+ // change the phi which does affect other users of the gep (which will still
+ // be using the phi in the old way)
+ Value *Load =
+ tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, TypeScale, Builder);
+ if (Load != nullptr)
+ return Load;
+ }
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+ "non-wb gather/scatter\n");
+
+ std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
+ if (Add.first == nullptr)
+ return nullptr;
+ Value *OffsetsIncoming = Add.first;
+ int64_t Immediate = Add.second;
+
+ // Make sure the offsets are scaled correctly
+ Instruction *ScaledOffsets = BinaryOperator::Create(
+ Instruction::Shl, OffsetsIncoming,
+ Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)),
+ "ScaledIndex", I);
+ // Add the base to the offsets
+ OffsetsIncoming = BinaryOperator::Create(
+ Instruction::Add, ScaledOffsets,
+ Builder.CreateVectorSplat(
+ Ty->getNumElements(),
+ Builder.CreatePtrToInt(
+ BasePtr,
+ cast<VectorType>(ScaledOffsets->getType())->getElementType())),
+ "StartIndex", I);
+
+ if (I->getIntrinsicID() == Intrinsic::masked_gather)
+ return cast<IntrinsicInst>(
+ tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+ else
+ return cast<IntrinsicInst>(
+ tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate));
+}
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
+ IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale,
+ IRBuilder<> &Builder) {
+ // Check whether this gather's offset is incremented by a constant - if so,
+ // and the load is of the right type, we can merge this into a QI gather
+ Loop *L = LI->getLoopFor(I->getParent());
+ // Offsets that are worth merging into this instruction will be incremented
+ // by a constant, thus we're looking for an add of a phi and a constant
+ PHINode *Phi = dyn_cast<PHINode>(Offsets);
+ if (Phi == nullptr || Phi->getNumIncomingValues() != 2 ||
+ Phi->getParent() != L->getHeader() || Phi->getNumUses() != 2)
+ // No phi means no IV to write back to; if there is a phi, we expect it
+ // to have exactly two incoming values; the only phis we are interested in
+ // will be loop IV's and have exactly two uses, one in their increment and
+ // one in the gather's gep
+ return nullptr;
+
+ unsigned IncrementIndex =
+ Phi->getIncomingBlock(0) == L->getLoopLatch() ? 0 : 1;
+ // Look through the phi to the phi increment
+ Offsets = Phi->getIncomingValue(IncrementIndex);
+
+ std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
+ if (Add.first == nullptr)
+ return nullptr;
+ Value *OffsetsIncoming = Add.first;
+ int64_t Immediate = Add.second;
+ if (OffsetsIncoming != Phi)
+ // Then the increment we are looking at is not an increment of the
+ // induction variable, and we don't want to do a writeback
+ return nullptr;
+
+ Builder.SetInsertPoint(&Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ unsigned NumElems =
+ cast<FixedVectorType>(OffsetsIncoming->getType())->getNumElements();
+
+ // Make sure the offsets are scaled correctly
+ Instruction *ScaledOffsets = BinaryOperator::Create(
+ Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex),
+ Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)),
+ "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ // Add the base to the offsets
+ OffsetsIncoming = BinaryOperator::Create(
+ Instruction::Add, ScaledOffsets,
+ Builder.CreateVectorSplat(
+ NumElems,
+ Builder.CreatePtrToInt(
+ BasePtr,
+ cast<VectorType>(ScaledOffsets->getType())->getElementType())),
+ "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ // The gather is pre-incrementing
+ OffsetsIncoming = BinaryOperator::Create(
+ Instruction::Sub, OffsetsIncoming,
+ Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)),
+ "PreIncrementStartIndex",
+ &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming);
+
+ Builder.SetInsertPoint(I);
+
+ Value *EndResult;
+ Value *NewInduction;
+ if (I->getIntrinsicID() == Intrinsic::masked_gather) {
+ // Build the incrementing gather
+ Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
+ // One value to be handed to whoever uses the gather, one is the loop
+ // increment
+ EndResult = Builder.CreateExtractValue(Load, 0, "Gather");
+ NewInduction = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+ } else {
+ // Build the incrementing scatter
+ NewInduction = tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate);
+ EndResult = NewInduction;
+ }
+ Instruction *AddInst = cast<Instruction>(Offsets);
+ AddInst->replaceAllUsesWith(NewInduction);
+ AddInst->eraseFromParent();
+ Phi->setIncomingValue(IncrementIndex, NewInduction);
+
+ return EndResult;
+}
+
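Because the writeback form built by tryCreateIncrementingWBGatScat is pre-incrementing, the phi's start value is rewritten to base + (offsets << TypeScale) - Immediate, so that the first post-writeback address is exactly the original first address. In scalar terms, with hypothetical numbers just to show the algebra:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical loop: base pointer 0x1000, first offset 0, 4-byte elements
  // (TypeScale 2), constant increment of 16 bytes per iteration.
  std::uint64_t Base = 0x1000, FirstOffset = 0, Immediate = 16;
  int TypeScale = 2;

  // Pre-increment start index chosen by the pass ("PreIncrementStartIndex"):
  std::uint64_t Start = Base + (FirstOffset << TypeScale) - Immediate;

  // First iteration of a pre-incrementing gather: bump, then access.
  std::uint64_t Pointer = Start + Immediate;
  std::printf("first access at 0x%llx\n",
              static_cast<unsigned long long>(Pointer));  // 0x1000, unchanged
  return 0;
}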
+void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
+ Value *OffsSecondOperand,
+ unsigned StartIndex) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n");
+ Instruction *InsertionPoint =
+ &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back());
+ // Initialize the phi with a vector that contains a sum of the constants
+ Instruction *NewIndex = BinaryOperator::Create(
+ Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand,
+ "PushedOutAdd", InsertionPoint);
+ unsigned IncrementIndex = StartIndex == 0 ? 1 : 0;
+
+ // Order such that start index comes first (this reduces mov's)
+ Phi->addIncoming(NewIndex, Phi->getIncomingBlock(StartIndex));
+ Phi->addIncoming(Phi->getIncomingValue(IncrementIndex),
+ Phi->getIncomingBlock(IncrementIndex));
+ Phi->removeIncomingValue(IncrementIndex);
+ Phi->removeIncomingValue(StartIndex);
+}
+
+void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
+ Value *IncrementPerRound,
+ Value *OffsSecondOperand,
+ unsigned LoopIncrement,
+ IRBuilder<> &Builder) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n");
+
+ // Create a new scalar add outside of the loop and transform it to a splat
+ // by which loop variable can be incremented
+ Instruction *InsertionPoint = &cast<Instruction>(
+ Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
+
+ // Create a new index
+ Value *StartIndex = BinaryOperator::Create(
+ Instruction::Mul, Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1),
+ OffsSecondOperand, "PushedOutMul", InsertionPoint);
+
+ Instruction *Product =
+ BinaryOperator::Create(Instruction::Mul, IncrementPerRound,
+ OffsSecondOperand, "Product", InsertionPoint);
+ // Increment NewIndex by Product instead of the multiplication
+ Instruction *NewIncrement = BinaryOperator::Create(
+ Instruction::Add, Phi, Product, "IncrementPushedOutMul",
+ cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back())
+ .getPrevNode());
+
+ Phi->addIncoming(StartIndex,
+ Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1));
+ Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement));
+ Phi->removeIncomingValue((unsigned)0);
+ Phi->removeIncomingValue((unsigned)0);
+ return;
+}
+
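pushOutMul relies on distributivity: if a loop offset is computed as phi * C, where phi starts at S and is bumped by Inc each round, the multiply can be hoisted by starting a new phi at S * C and bumping it by Inc * C. A scalar demonstration that both formulations visit the same offsets (toy numbers, not IR):

#include <cassert>

int main() {
  const int S = 3, Inc = 2, C = 5, Rounds = 8;

  int Phi = S;                  // original induction variable
  int NewPhi = S * C;           // the hoisted start value ("PushedOutMul")
  const int Product = Inc * C;  // "Product": increment of the new phi

  for (int Round = 0; Round < Rounds; ++Round) {
    assert(Phi * C == NewPhi);  // offset fed to the gather is unchanged
    Phi += Inc;                 // old loop increment
    NewPhi += Product;          // "IncrementPushedOutMul"
  }
  return 0;
}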
+// Check whether all uses of this instruction are as offsets of
+// gathers/scatters, or simple arithmetic that is only used by gathers/scatters
+static bool hasAllGatScatUsers(Instruction *I) {
+ if (I->hasNUses(0)) {
+ return false;
+ }
+ bool Gatscat = true;
+ for (User *U : I->users()) {
+ if (!isa<Instruction>(U))
+ return false;
+ if (isa<GetElementPtrInst>(U) ||
+ isGatherScatter(dyn_cast<IntrinsicInst>(U))) {
+ return Gatscat;
+ } else {
+ unsigned OpCode = cast<Instruction>(U)->getOpcode();
+ if ((OpCode == Instruction::Add || OpCode == Instruction::Mul) &&
+ hasAllGatScatUsers(cast<Instruction>(U))) {
+ continue;
+ }
+ return false;
+ }
+ }
+ return Gatscat;
+}
+
+bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
+ LoopInfo *LI) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n");
+ // Optimise the addresses of gathers/scatters by moving invariant
+ // calculations out of the loop
+ if (!isa<Instruction>(Offsets))
+ return false;
+ Instruction *Offs = cast<Instruction>(Offsets);
+ if (Offs->getOpcode() != Instruction::Add &&
+ Offs->getOpcode() != Instruction::Mul)
+ return false;
+ Loop *L = LI->getLoopFor(BB);
+ if (L == nullptr)
+ return false;
+ if (!Offs->hasOneUse()) {
+ if (!hasAllGatScatUsers(Offs))
+ return false;
+ }
+
+ // Find out which, if any, operand of the instruction
+ // is a phi node
+ PHINode *Phi;
+ int OffsSecondOp;
+ if (isa<PHINode>(Offs->getOperand(0))) {
+ Phi = cast<PHINode>(Offs->getOperand(0));
+ OffsSecondOp = 1;
+ } else if (isa<PHINode>(Offs->getOperand(1))) {
+ Phi = cast<PHINode>(Offs->getOperand(1));
+ OffsSecondOp = 0;
+ } else {
+ bool Changed = true;
+ if (isa<Instruction>(Offs->getOperand(0)) &&
+ L->contains(cast<Instruction>(Offs->getOperand(0))))
+ Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI);
+ if (isa<Instruction>(Offs->getOperand(1)) &&
+ L->contains(cast<Instruction>(Offs->getOperand(1))))
+ Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI);
+ if (!Changed) {
+ return false;
+ } else {
+ if (isa<PHINode>(Offs->getOperand(0))) {
+ Phi = cast<PHINode>(Offs->getOperand(0));
+ OffsSecondOp = 1;
+ } else if (isa<PHINode>(Offs->getOperand(1))) {
+ Phi = cast<PHINode>(Offs->getOperand(1));
+ OffsSecondOp = 0;
+ } else {
+ return false;
+ }
+ }
+ }
+ // A phi node we want to perform this function on should be from the
+ // loop header, and shouldn't have more than 2 incoming values
+ if (Phi->getParent() != L->getHeader() ||
+ Phi->getNumIncomingValues() != 2)
+ return false;
+
+ // The phi must be an induction variable
+ Instruction *Op;
+ int IncrementingBlock = -1;
+
+ for (int i = 0; i < 2; i++)
+ if ((Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) != nullptr)
+ if (Op->getOpcode() == Instruction::Add &&
+ (Op->getOperand(0) == Phi || Op->getOperand(1) == Phi))
+ IncrementingBlock = i;
+ if (IncrementingBlock == -1)
+ return false;
+
+ Instruction *IncInstruction =
+ cast<Instruction>(Phi->getIncomingValue(IncrementingBlock));
+
+ // If the phi is not used by anything else, we can just adapt it when
+ // replacing the instruction; if it is, we'll have to duplicate it
+ PHINode *NewPhi;
+ Value *IncrementPerRound = IncInstruction->getOperand(
+ (IncInstruction->getOperand(0) == Phi) ? 1 : 0);
+
+ // Get the value that is added to/multiplied with the phi
+ Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp);
+
+ if (IncrementPerRound->getType() != OffsSecondOperand->getType())
+ // Something has gone wrong, abort
+ return false;
+
+ // Only proceed if the increment per round is a constant or an instruction
+ // which does not originate from within the loop
+ if (!isa<Constant>(IncrementPerRound) &&
+ !(isa<Instruction>(IncrementPerRound) &&
+ !L->contains(cast<Instruction>(IncrementPerRound))))
+ return false;
+
+ if (Phi->getNumUses() == 2) {
+ // No other users -> reuse existing phi (One user is the instruction
+ // we're looking at, the other is the phi increment)
+ if (IncInstruction->getNumUses() != 1) {
+ // If the incrementing instruction does have more users than
+ // our phi, we need to copy it
+ IncInstruction = BinaryOperator::Create(
+ Instruction::BinaryOps(IncInstruction->getOpcode()), Phi,
+ IncrementPerRound, "LoopIncrement", IncInstruction);
+ Phi->setIncomingValue(IncrementingBlock, IncInstruction);
+ }
+ NewPhi = Phi;
+ } else {
+ // There are other users -> create a new phi
+ NewPhi = PHINode::Create(Phi->getType(), 0, "NewPhi", Phi);
+ std::vector<Value *> Increases;
+ // Copy the incoming values of the old phi
+ NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1),
+ Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
+ IncInstruction = BinaryOperator::Create(
+ Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi,
+ IncrementPerRound, "LoopIncrement", IncInstruction);
+ NewPhi->addIncoming(IncInstruction,
+ Phi->getIncomingBlock(IncrementingBlock));
+ IncrementingBlock = 1;
+ }
+
+ IRBuilder<> Builder(BB->getContext());
+ Builder.SetInsertPoint(Phi);
+ Builder.SetCurrentDebugLocation(Offs->getDebugLoc());
+
+ switch (Offs->getOpcode()) {
+ case Instruction::Add:
+ pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1);
+ break;
+ case Instruction::Mul:
+ pushOutMul(NewPhi, IncrementPerRound, OffsSecondOperand, IncrementingBlock,
+ Builder);
+ break;
+ default:
+ return false;
+ }
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: simplified loop variable add/mul\n");
+
+ // The instruction has now been "absorbed" into the phi value
+ Offs->replaceAllUsesWith(NewPhi);
+ if (Offs->hasNUses(0))
+ Offs->eraseFromParent();
+ // Clean up the old increment in case it's unused because we built a new
+ // one
+ if (IncInstruction->hasNUses(0))
+ IncInstruction->eraseFromParent();
+
+ return true;
}
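
The transformation implemented above is easiest to see on a scalar model. The following standalone sketch is not part of the patch (sumBefore, sumAfter and the sizes are illustrative names and values); it shows the add on the induction variable being absorbed into the induction variable itself, which is what pushOutAdd achieves for the vector gather/scatter offsets:

// offsets_sketch.cpp -- a minimal scalar model of the offsets rewrite
// (illustrative only; not LLVM code).
#include <cstdio>

// "Before": Offs = add(Phi, X) is recomputed on every iteration and feeds
// the gather/scatter offset.
static long sumBefore(const int *Data, int N, int X, int Step) {
  long Sum = 0;
  for (int I = 0; I < N; I += Step)
    Sum += Data[I + X];          // per-iteration add on the induction variable
  return Sum;
}

// "After": the add has been pushed out into the phi, so the loop body uses
// the induction variable directly.
static long sumAfter(const int *Data, int N, int X, int Step) {
  long Sum = 0;
  for (int Off = X; Off < N + X; Off += Step)
    Sum += Data[Off];            // offset is now the induction variable itself
  return Sum;
}

int main() {
  int Data[64];
  for (int I = 0; I < 64; ++I) Data[I] = I;
  std::printf("%ld %ld\n", sumBefore(Data, 32, 4, 4), sumAfter(Data, 32, 4, 4));
  return 0;
}

Both functions compute the same sum; after the rewrite the loop body no longer contains the per-iteration add feeding the offset.
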
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
@@ -282,20 +986,51 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
if (!ST->hasMVEIntegerOps())
return false;
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SmallVector<IntrinsicInst *, 4> Gathers;
+ SmallVector<IntrinsicInst *, 4> Scatters;
+
+ bool Changed = false;
+
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
Gathers.push_back(II);
+ if (isa<GetElementPtrInst>(II->getArgOperand(0)))
+ Changed |= optimiseOffsets(
+ cast<Instruction>(II->getArgOperand(0))->getOperand(1),
+ II->getParent(), LI);
+ } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
+ Scatters.push_back(II);
+ if (isa<GetElementPtrInst>(II->getArgOperand(1)))
+ Changed |= optimiseOffsets(
+ cast<Instruction>(II->getArgOperand(1))->getOperand(1),
+ II->getParent(), LI);
+ }
}
}
- if (Gathers.empty())
- return false;
+ for (unsigned i = 0; i < Gathers.size(); i++) {
+ IntrinsicInst *I = Gathers[i];
+ Value *L = lowerGather(I);
+ if (L == nullptr)
+ continue;
- for (IntrinsicInst *I : Gathers)
- lowerGather(I);
+ // Get rid of any now dead instructions
+ SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
+ Changed = true;
+ }
- return true;
+ for (unsigned i = 0; i < Scatters.size(); i++) {
+ IntrinsicInst *I = Scatters[i];
+ Value *S = lowerScatter(I);
+ if (S == nullptr)
+ continue;
+
+ // Get rid of any now dead instructions
+ SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
+ Changed = true;
+ }
+ return Changed;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
index 038c68739cdf..5bf3522ab2e6 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -1,4 +1,4 @@
-//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===//
+//===- MVETailPredication.cpp - MVE Tail Predication ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,17 @@
//
/// \file
/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
-/// branches to help accelerate DSP applications. These two extensions can be
-/// combined to provide implicit vector predication within a low-overhead loop.
+/// branches to help accelerate DSP applications. These two extensions,
+/// combined with a new form of predication called tail-predication, can be used
+/// to provide implicit vector predication within a low-overhead loop.
+/// This is implicit because the predicate of active/inactive lanes is
+/// calculated by hardware, and thus does not need to be explicitly passed
+/// to vector instructions. The instructions responsible for this are the
+/// DLSTP and WLSTP instructions, which set up a tail-predicated loop and the
+/// total number of data elements processed by the loop. The loop-end
+/// LETP instruction is responsible for decrementing and setting the remaining
+/// elements to be processed and generating the mask of active lanes.
+///
/// The HardwareLoops pass inserts intrinsics identifying loops that the
/// backend will attempt to convert into a low-overhead loop. The vectorizer is
/// responsible for generating a vectorized loop in which the lanes are
@@ -21,36 +30,62 @@
/// - A loop containing multiple VCPT instructions, predicating multiple VPT
/// blocks of instructions operating on different vector types.
///
-/// This pass inserts the inserts the VCTP intrinsic to represent the effect of
-/// tail predication. This will be picked up by the ARM Low-overhead loop pass,
-/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
-/// loop.
+/// This pass:
+/// 1) Checks if the predicates of the masked load/store instructions are
+/// generated by the intrinsic @llvm.get.active.lane.mask(). This intrinsic consumes
+/// the Backedge Taken Count (BTC) of the scalar loop as its second argument,
+/// which we extract to set up the number of elements processed by the loop.
+/// 2) The @llvm.get.active.lane.mask() intrinsic is then replaced by the MVE target
+/// specific VCTP intrinsic to represent the effect of tail predication.
+/// This will be picked up by the ARM Low-overhead loop pass, which performs
+/// the final transformation to a DLSTP or WLSTP tail-predicated loop.
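
As a rough scalar model of the behaviour described above (an element counter set up before the loop, decremented by the vector width on each iteration, with only the remaining lanes active on the final iteration), consider the following standalone sketch. It is illustrative only and not part of the patch; N, VW and the doubling operation are arbitrary choices:

// tail_predication_model.cpp -- scalar model of a tail-predicated loop
// (illustrative only; it mimics the DLSTP/VCTP/LETP behaviour in software).
#include <algorithm>
#include <cstdio>

int main() {
  const int N = 10, VW = 4;            // 10 elements, 4 lanes per vector
  int In[N], Out[N] = {0};
  for (int I = 0; I < N; ++I) In[I] = I;

  int Remaining = N;                   // element count set up by DLSTP/WLSTP
  for (int Base = 0; Base < N; Base += VW) {
    // VCTP: lanes [0, min(Remaining, VW)) are active, the rest are masked off.
    int Active = std::min(Remaining, VW);
    for (int Lane = 0; Lane < Active; ++Lane)
      Out[Base + Lane] = In[Base + Lane] * 2;
    Remaining -= VW;                   // LETP decrements by the vector width
  }
  for (int I = 0; I < N; ++I)
    std::printf("%d ", Out[I]);
  std::printf("\n");
  return 0;
}
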
#include "ARM.h"
#include "ARMSubtarget.h"
+#include "ARMTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
#define DEBUG_TYPE "mve-tail-predication"
#define DESC "Transform predicated vector loops to use MVE tail predication"
-cl::opt<bool>
-DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
- cl::init(true),
- cl::desc("Disable MVE Tail Predication"));
+cl::opt<TailPredication::Mode> EnableTailPredication(
+ "tail-predication", cl::desc("MVE tail-predication options"),
+ cl::init(TailPredication::Disabled),
+ cl::values(clEnumValN(TailPredication::Disabled, "disabled",
+ "Don't tail-predicate loops"),
+ clEnumValN(TailPredication::EnabledNoReductions,
+ "enabled-no-reductions",
+ "Enable tail-predication, but not for reduction loops"),
+ clEnumValN(TailPredication::Enabled,
+ "enabled",
+ "Enable tail-predication, including reduction loops"),
+ clEnumValN(TailPredication::ForceEnabledNoReductions,
+ "force-enabled-no-reductions",
+ "Enable tail-predication, but not for reduction loops, "
+ "and force this which might be unsafe"),
+ clEnumValN(TailPredication::ForceEnabled,
+ "force-enabled",
+ "Enable tail-predication, including reduction loops, "
+ "and force this which might be unsafe")));
+
+
namespace {
class MVETailPredication : public LoopPass {
@@ -58,6 +93,7 @@ class MVETailPredication : public LoopPass {
Loop *L = nullptr;
ScalarEvolution *SE = nullptr;
TargetTransformInfo *TTI = nullptr;
+ const ARMSubtarget *ST = nullptr;
public:
static char ID;
@@ -76,7 +112,6 @@ public:
bool runOnLoop(Loop *L, LPPassManager&) override;
private:
-
/// Perform the relevant checks on the loop and convert if possible.
bool TryConvert(Value *TripCount);
@@ -84,19 +119,21 @@ private:
/// load/stores.
bool IsPredicatedVectorLoop();
- /// Compute a value for the total number of elements that the predicated
- /// loop will process.
- Value *ComputeElements(Value *TripCount, VectorType *VecTy);
-
- /// Is the icmp that generates an i1 vector, based upon a loop counter
- /// and a limit that is defined outside the loop.
- bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+ /// Perform checks on the arguments of @llvm.get.active.lane.mask
+  /// intrinsic: check if the first is a loop induction variable, and for the
+  /// second check that no overflow can occur in the expressions that use
+  /// this backedge-taken count.
+ bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+ FixedVectorType *VecTy);
/// Insert the intrinsic to represent the effect of tail predication.
- void InsertVCTPIntrinsic(Instruction *Predicate,
- DenseMap<Instruction*, Instruction*> &NewPredicates,
- VectorType *VecTy,
- Value *NumElements);
+ void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+ FixedVectorType *VecTy);
+
+ /// Rematerialize the iteration count in exit blocks, which enables
+ /// ARMLowOverheadLoops to better optimise away loop update statements inside
+ /// hardware-loops.
+ void RematerializeIterCount();
};
} // end namespace
@@ -121,13 +158,14 @@ static bool IsMasked(Instruction *I) {
}
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
- if (skipLoop(L) || DisableTailPredication)
+ if (skipLoop(L) || !EnableTailPredication)
return false;
+ MaskedInsts.clear();
Function &F = *L->getHeader()->getParent();
auto &TPC = getAnalysis<TargetPassConfig>();
auto &TM = TPC.getTM<TargetMachine>();
- auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ ST = &TM.getSubtarget<ARMSubtarget>(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
this->L = L;
@@ -185,125 +223,59 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
- return TryConvert(Setup->getArgOperand(0));
-}
-bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
- // Look for the following:
-
- // %trip.count.minus.1 = add i32 %N, -1
- // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
- // i32 %trip.count.minus.1, i32 0
- // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
- // <4 x i32> undef,
- // <4 x i32> zeroinitializer
- // ...
- // ...
- // %index = phi i32
- // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
- // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
- // <4 x i32> undef,
- // <4 x i32> zeroinitializer
- // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
- // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
-
- // And return whether V == %pred.
-
- using namespace PatternMatch;
-
- CmpInst::Predicate Pred;
- Instruction *Shuffle = nullptr;
- Instruction *Induction = nullptr;
-
- // The vector icmp
- if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
- m_Instruction(Shuffle))) ||
- Pred != ICmpInst::ICMP_ULE)
- return false;
-
- // First find the stuff outside the loop which is setting up the limit
- // vector....
- // The invariant shuffle that broadcast the limit into a vector.
- Instruction *Insert = nullptr;
- if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
- m_Zero())))
- return false;
-
- // Insert the limit into a vector.
- Instruction *BECount = nullptr;
- if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount),
- m_Zero())))
- return false;
-
- // The limit calculation, backedge count.
- Value *TripCount = nullptr;
- if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
- return false;
-
- if (TripCount != NumElements || !L->isLoopInvariant(BECount))
- return false;
-
- // Now back to searching inside the loop body...
- // Find the add with takes the index iv and adds a constant vector to it.
- Instruction *BroadcastSplat = nullptr;
- Constant *Const = nullptr;
- if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
- m_Constant(Const))))
- return false;
-
- // Check that we're adding <0, 1, 2, 3...
- if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
- for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
- if (CDS->getElementAsInteger(i) != i)
- return false;
- }
- } else
- return false;
-
- // The shuffle which broadcasts the index iv into a vector.
- if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
- m_Zero())))
- return false;
-
- // The insert element which initialises a vector with the index iv.
- Instruction *IV = nullptr;
- if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero())))
- return false;
-
- // The index iv.
- auto *Phi = dyn_cast<PHINode>(IV);
- if (!Phi)
- return false;
-
- // TODO: Don't think we need to check the entry value.
- Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
- if (!match(OnEntry, m_Zero()))
- return false;
-
- Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
- unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
-
- Instruction *LHS = nullptr;
- if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
+ if (!TryConvert(Setup->getArgOperand(0))) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
return false;
+ }
- return LHS == Phi;
+ return true;
}
-static VectorType* getVectorType(IntrinsicInst *I) {
+static FixedVectorType *getVectorType(IntrinsicInst *I) {
unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
- return cast<VectorType>(PtrTy->getElementType());
+ auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType());
+ assert(VecTy && "No scalable vectors expected here");
+ return VecTy;
}
bool MVETailPredication::IsPredicatedVectorLoop() {
// Check that the loop contains at least one masked load/store intrinsic.
// We only support 'normal' vector instructions - other than masked
// load/stores.
+ bool ActiveLaneMask = false;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ auto *Int = dyn_cast<IntrinsicInst>(&I);
+ if (!Int)
+ continue;
+
+ switch (Int->getIntrinsicID()) {
+ case Intrinsic::get_active_lane_mask:
+ ActiveLaneMask = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::sadd_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::usub_sat:
+ continue;
+ case Intrinsic::fma:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::fabs:
+ if (ST->hasMVEFloatOps())
+ continue;
+ LLVM_FALLTHROUGH;
+ default:
+ break;
+ }
+
if (IsMasked(&I)) {
- VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+ auto *VecTy = getVectorType(Int);
unsigned Lanes = VecTy->getNumElements();
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
@@ -312,94 +284,23 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
- } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
- for (auto &U : Int->args()) {
- if (isa<VectorType>(U->getType()))
- return false;
- }
+ continue;
+ }
+
+ for (const Use &U : Int->args()) {
+ if (isa<VectorType>(U->getType()))
+ return false;
}
}
}
+ if (!ActiveLaneMask) {
+ LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
+ return false;
+ }
return !MaskedInsts.empty();
}
-Value* MVETailPredication::ComputeElements(Value *TripCount,
- VectorType *VecTy) {
- const SCEV *TripCountSE = SE->getSCEV(TripCount);
- ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()),
- VecTy->getNumElements());
-
- if (VF->equalsInt(1))
- return nullptr;
-
- // TODO: Support constant trip counts.
- auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* {
- if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
- if (Const->getAPInt() != -VF->getValue())
- return nullptr;
- } else
- return nullptr;
- return dyn_cast<SCEVMulExpr>(S->getOperand(1));
- };
-
- auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* {
- if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
- if (Const->getValue() != VF)
- return nullptr;
- } else
- return nullptr;
- return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
- };
-
- auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* {
- if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
- if (Const->getValue() != VF)
- return nullptr;
- } else
- return nullptr;
-
- if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
- if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
- if (Const->getAPInt() != (VF->getValue() - 1))
- return nullptr;
- } else
- return nullptr;
-
- return RoundUp->getOperand(1);
- }
- return nullptr;
- };
-
- // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
- // determine the numbers of elements instead? Looks like this is what is used
- // for delinearization, but I'm not sure if it can be applied to the
- // vectorized form - at least not without a bit more work than I feel
- // comfortable with.
-
- // Search for Elems in the following SCEV:
- // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
- const SCEV *Elems = nullptr;
- if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
- if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
- if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
- if (auto *Mul = VisitAdd(Add))
- if (auto *Div = VisitMul(Mul))
- if (auto *Res = VisitDiv(Div))
- Elems = Res;
-
- if (!Elems)
- return nullptr;
-
- Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
- if (!isSafeToExpandAt(Elems, InsertPt, *SE))
- return nullptr;
-
- auto DL = L->getHeader()->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "elements");
- return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
-}
-
// Look through the exit block to see whether there's a duplicate predicate
// instruction. This can happen when we need to perform a select on values
// from the last and previous iteration. Instead of doing a straight
@@ -407,31 +308,13 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
// in the block. This means that the VPR doesn't have to be live into the
// exit block which should make it easier to convert this loop into a proper
// tail predicated loop.
-static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
- SetVector<Instruction*> &MaybeDead, Loop *L) {
+static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
BasicBlock *Exit = L->getUniqueExitBlock();
if (!Exit) {
LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
return;
}
- for (auto &Pair : NewPredicates) {
- Instruction *OldPred = Pair.first;
- Instruction *NewPred = Pair.second;
-
- for (auto &I : *Exit) {
- if (I.isSameOperationAs(OldPred)) {
- Instruction *PredClone = NewPred->clone();
- PredClone->insertBefore(&I);
- I.replaceAllUsesWith(PredClone);
- MaybeDead.insert(&I);
- LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
- dbgs() << "ARM TP: with: "; PredClone->dump());
- break;
- }
- }
- }
-
// Drop references and add operands to check for dead.
SmallPtrSet<Instruction*, 4> Dead;
while (!MaybeDead.empty()) {
@@ -440,11 +323,10 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
if (I->hasNUsesOrMore(1))
continue;
- for (auto &U : I->operands()) {
+ for (auto &U : I->operands())
if (auto *OpI = dyn_cast<Instruction>(U))
MaybeDead.insert(OpI);
- }
- I->dropAllReferences();
+
Dead.insert(I);
}
@@ -457,24 +339,211 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
DeleteDeadPHIs(I);
}
-void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
- DenseMap<Instruction*, Instruction*> &NewPredicates,
- VectorType *VecTy, Value *NumElements) {
- IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+// The active lane intrinsic has this form:
+//
+// @llvm.get.active.lane.mask(IV, BTC)
+//
+// Here we perform checks that this intrinsic behaves as expected,
+// which means:
+//
+// 1) The element count, which is calculated with BTC + 1, cannot overflow.
+// 2) The element count needs to be sufficiently large that the decrement of
+// element counter doesn't overflow, which means that we need to prove:
+// ceil(ElementCount / VectorWidth) >= TripCount
+//    by rounding ElementCount up:
+//       (ElementCount + (VectorWidth - 1)) / VectorWidth
+//    and evaluating whether the expression isKnownNonNegative:
+//       ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+// 3) The IV must be an induction phi with an increment equal to the
+// vector width.
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+ Value *TripCount, FixedVectorType *VecTy) {
+ bool ForceTailPredication =
+ EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
+ EnableTailPredication == TailPredication::ForceEnabled;
+ // 1) Test whether entry to the loop is protected by a conditional
+  // BTC + 1 < 0. In other words, if the scalar trip count overflows and
+  // becomes negative, we shouldn't enter the loop and creating the
+  // tripcount expression BTC + 1 is not safe. So, check that BTC
+ // isn't max. This is evaluated in unsigned, because the semantics
+ // of @get.active.lane.mask is a ULE comparison.
+
+ int VectorWidth = VecTy->getNumElements();
+ auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
+ auto *BTC = SE->getSCEV(BackedgeTakenCount);
+
+ if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+ !ForceTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
+ BTC->dump());
+ return false;
+ }
+
+ // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
+ //
+  //     ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+ //
+ // 2.1) First prove overflow can't happen in:
+ //
+ // ElementCount + (VectorWidth - 1)
+ //
+  // Because of a lack of context, it is difficult to get a useful bound on
+ // this expression. But since ElementCount uses the same variables as the
+ // TripCount (TC), for which we can find meaningful value ranges, we use that
+ // instead and assert that:
+ //
+ // upperbound(TC) <= UINT_MAX - VectorWidth
+ //
+ auto *TC = SE->getSCEV(TripCount);
+ unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+ auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
+ uint64_t MaxMinusVW = Diff.getZExtValue();
+ uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+
+ if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
+ dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
+ dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";);
+ return false;
+ }
+
+ // 2.2) Make sure overflow doesn't happen in final expression:
+  //   ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,
+ // To do this, compare the full ranges of these subexpressions:
+ //
+ // Range(Ceil) <= Range(TC)
+ //
+  // where Ceil = (ElementCount + (VW-1)) / VW. If Ceil and TC are runtime
+  // values (and not constants), we have to compensate for the lower bound of
+  // the value range being off by 1. The reason is that BTC lives in the preheader in
+ // this form:
+ //
+ // %trip.count.minus = add nsw nuw i32 %N, -1
+ //
+ // For the loop to be executed, %N has to be >= 1 and as a result the value
+ // range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
+ //
+ // %5 = add nuw nsw i32 %4, 1
+ // call void @llvm.set.loop.iterations.i32(i32 %5)
+ //
+ // where %5 is some expression using %N, which needs to have a lower bound of
+ // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
+ // we first add 0 to TC such that we can do the <= comparison on both sets.
+ //
+ auto *One = SE->getOne(TripCount->getType());
+ // ElementCount = BTC + 1
+ auto *ElementCount = SE->getAddExpr(BTC, One);
+ // Tmp = ElementCount + (VW-1)
+ auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
+ SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+  // Ceil = (ElementCount + (VW-1)) / VW
+ auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
+ SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
+
+  ConstantRange RangeCeil = SE->getSignedRange(Ceil);
+  ConstantRange RangeTC = SE->getSignedRange(TC);
+ if (!RangeTC.isSingleElement()) {
+ auto ZeroRange =
+ ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
+ RangeTC = RangeTC.unionWith(ZeroRange);
+ }
+ if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
+ return false;
+ }
+
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
+  // helpers here to get the induction variable, because the hardware loop is
+  // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+ // different counter. Using SCEV, we check that the induction is of the
+ // form i = i + 4, where the increment must be equal to the VectorWidth.
+ auto *IV = ActiveLaneMask->getOperand(0);
+ auto *IVExpr = SE->getSCEV(IV);
+ auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+ if (!AddExpr) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+ return false;
+ }
+ // Check that this AddRec is associated with this loop.
+ if (AddExpr->getLoop() != L) {
+ LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+ return false;
+ }
+ auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+ if (!Step) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+ AddExpr->getOperand(1)->dump());
+ return false;
+ }
+ auto StepValue = Step->getValue()->getSExtValue();
+ if (VectorWidth == StepValue)
+ return true;
+
+ LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+ "vector width " << VectorWidth << "\n");
+
+ return false;
+}
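
The overflow reasoning in IsSafeActiveMask can be restated on concrete unsigned integers. The sketch below is illustrative only; safeToTailPredicate is a made-up name, and plain integer comparisons stand in for the SCEV range queries used above. It mirrors checks 1), 2.1) and 2.2):

// overflow_checks.cpp -- the arithmetic behind the checks, on concrete
// integers instead of SCEV ranges (illustrative only).
#include <cstdint>
#include <cstdio>

static bool safeToTailPredicate(uint32_t BTC, uint32_t TripCount, uint32_t VW) {
  if (BTC == UINT32_MAX)                    // 1) BTC + 1 would overflow
    return false;
  if (TripCount > UINT32_MAX - VW)          // 2.1) the rounding could overflow
    return false;
  uint64_t ElementCount = uint64_t(BTC) + 1;
  uint64_t Ceil = (ElementCount + VW - 1) / VW;
  return Ceil >= TripCount;                 // 2.2) the subtraction is non-negative
}

int main() {
  std::printf("%d\n", safeToTailPredicate(9, 3, 4));           // 10 elements, 3 iterations
  std::printf("%d\n", safeToTailPredicate(UINT32_MAX, 3, 4));  // BTC can be max: reject
  return 0;
}
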
+
+// Materialize NumElements in the preheader block.
+static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
+  // First, check whether the preheader already contains:
+ //
+ // preheader:
+ // %BTC = add i32 %N, -1
+ // ..
+ // vector.body:
+ //
+  // If %BTC already exists in this form, we don't need to emit
+  // %NumElems = %BTC + 1, but can instead just return %N.
+ for (auto &I : *Preheader) {
+ if (I.getOpcode() != Instruction::Add || &I != BTC)
+ continue;
+ ConstantInt *MinusOne = nullptr;
+ if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+ continue;
+ if (MinusOne->getSExtValue() == -1) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+ return I.getOperand(0);
+ }
+ }
+
+  // Otherwise we do need to materialise NumElements = BTC + 1, e.g. when
+  // BTC is a constant.
+ IRBuilder<> Builder(Preheader->getTerminator());
+ Value *NumElements = Builder.CreateAdd(BTC,
+ ConstantInt::get(BTC->getType(), 1), "num.elements");
+ LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
+ return NumElements;
+}
+
+void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
+ Value *TripCount, FixedVectorType *VecTy) {
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
+ unsigned VectorWidth = VecTy->getNumElements();
+
+ // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
+ // is one less than the trip count. So we need to find or create
+ // %num.elements = %BTC + 1 in the preheader.
+ Value *BTC = ActiveLaneMask->getOperand(1);
+ Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+ Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
// Insert a phi to count the number of elements processed by the loop.
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
PHINode *Processed = Builder.CreatePHI(Ty, 2);
Processed->addIncoming(NumElements, L->getLoopPreheader());
- // Insert the intrinsic to represent the effect of tail predication.
- Builder.SetInsertPoint(cast<Instruction>(Predicate));
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
+ Builder.SetInsertPoint(ActiveLaneMask);
ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+ ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
Intrinsic::ID VCTPID;
- switch (VecTy->getNumElements()) {
+ switch (VectorWidth) {
default:
llvm_unreachable("unexpected number of lanes");
case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -488,9 +557,8 @@ void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
// purposes, but takes a v4i1 instead of a v2i1.
}
Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
- Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
- Predicate->replaceAllUsesWith(TailPredicate);
- NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+ Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
+ ActiveLaneMask->replaceAllUsesWith(VCTPCall);
// Add the incoming value to the new phi.
// TODO: This add likely already exists in the loop.
@@ -498,47 +566,45 @@ void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
Processed->addIncoming(Remaining, L->getLoopLatch());
LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
<< *Processed << "\n"
- << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+ << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
}
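
A quick scalar check of why this replacement is sound under the conditions verified by IsSafeActiveMask: the lane mask of @llvm.get.active.lane.mask(IV, BTC) equals the mask of a VCTP driven by the Processed counter that starts at BTC + 1 and decreases by the vector width. The sketch below is illustrative only; activeLaneMask and vctp are hand-written models, not LLVM APIs:

// vctp_equivalence.cpp -- checks, on a scalar model, that the two mask
// formulations agree (illustrative only).
#include <cassert>
#include <cstdio>

static bool activeLaneMask(unsigned IV, unsigned Lane, unsigned BTC) {
  return IV + Lane <= BTC;            // the ULE semantics of the intrinsic
}

static bool vctp(int Remaining, unsigned Lane) {
  return int(Lane) < Remaining;       // lanes below the element count are active
}

int main() {
  const unsigned VW = 4, NumElements = 10, BTC = NumElements - 1;
  for (unsigned IV = 0; IV < NumElements; IV += VW) {
    int Remaining = int(NumElements) - int(IV);  // the 'Processed' phi above
    for (unsigned Lane = 0; Lane < VW; ++Lane)
      assert(activeLaneMask(IV, Lane, BTC) == vctp(Remaining, Lane));
  }
  std::printf("lane masks match\n");
  return 0;
}
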
bool MVETailPredication::TryConvert(Value *TripCount) {
if (!IsPredicatedVectorLoop()) {
- LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
+ LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
return false;
}
LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
-
- // Walk through the masked intrinsics and try to find whether the predicate
- // operand is generated from an induction variable.
SetVector<Instruction*> Predicates;
- DenseMap<Instruction*, Instruction*> NewPredicates;
+ // Walk through the masked intrinsics and try to find whether the predicate
+ // operand is generated by intrinsic @llvm.get.active.lane.mask().
for (auto *I : MaskedInsts) {
- Intrinsic::ID ID = I->getIntrinsicID();
- unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
+ unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3;
auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
if (!Predicate || Predicates.count(Predicate))
continue;
- VectorType *VecTy = getVectorType(I);
- Value *NumElements = ComputeElements(TripCount, VecTy);
- if (!NumElements)
- continue;
-
- if (!isTailPredicate(Predicate, NumElements)) {
- LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
+ auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+ if (!ActiveLaneMask ||
+ ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
continue;
- }
- LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
Predicates.insert(Predicate);
+ LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
+ << *ActiveLaneMask << "\n");
- InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
+ auto *VecTy = getVectorType(I);
+ if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
+ InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
}
- // Now clean up.
- Cleanup(NewPredicates, Predicates, L);
+ Cleanup(Predicates, L);
return true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
index a5df46c94f42..dc769ae526bc 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include <cassert>
#include <new>
@@ -34,83 +34,220 @@ using namespace llvm;
#define DEBUG_TYPE "arm-mve-vpt"
namespace {
- class MVEVPTBlock : public MachineFunctionPass {
- public:
- static char ID;
+class MVEVPTBlock : public MachineFunctionPass {
+public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
- MVEVPTBlock() : MachineFunctionPass(ID) {}
+ MVEVPTBlock() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<ReachingDefAnalysis>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs).set(
- MachineFunctionProperties::Property::TracksLiveness);
- }
-
- StringRef getPassName() const override {
- return "MVE VPT block insertion pass";
- }
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
- private:
- bool InsertVPTBlocks(MachineBasicBlock &MBB);
+ StringRef getPassName() const override {
+ return "MVE VPT block insertion pass";
+ }
- const Thumb2InstrInfo *TII = nullptr;
- ReachingDefAnalysis *RDA = nullptr;
- };
+private:
+ bool InsertVPTBlocks(MachineBasicBlock &MBB);
+};
- char MVEVPTBlock::ID = 0;
+char MVEVPTBlock::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
-static MachineInstr *findVCMPToFoldIntoVPST(MachineInstr *MI,
- ReachingDefAnalysis *RDA,
+static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI,
unsigned &NewOpcode) {
- // First, search backwards to the instruction that defines VPR
- auto *Def = RDA->getReachingMIDef(MI, ARM::VPR);
- if (!Def)
- return nullptr;
+  // Search backwards to the instruction that defines VPR. This may or may not
+  // be a VCMP; we check that after this loop. If we find another instruction
+  // that reads VPR first, we return nullptr.
+ MachineBasicBlock::iterator CmpMI = MI;
+ while (CmpMI != MI->getParent()->begin()) {
+ --CmpMI;
+ if (CmpMI->modifiesRegister(ARM::VPR, TRI))
+ break;
+ if (CmpMI->readsRegister(ARM::VPR, TRI))
+ break;
+ }
- // Now check that Def is a VCMP
- if (!(NewOpcode = VCMPOpcodeToVPT(Def->getOpcode())))
+ if (CmpMI == MI)
+ return nullptr;
+ NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode());
+ if (NewOpcode == 0)
return nullptr;
- // Check that Def's operands are not defined between the VCMP and MI, i.e.
- // check that they have the same reaching def.
- if (!RDA->hasSameReachingDef(Def, MI, Def->getOperand(1).getReg()) ||
- !RDA->hasSameReachingDef(Def, MI, Def->getOperand(2).getReg()))
+ // Search forward from CmpMI to MI, checking if either register was def'd
+ if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI),
+ MI, TRI))
+ return nullptr;
+ if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI),
+ MI, TRI))
return nullptr;
+ return &*CmpMI;
+}
+
+// Advances Iter past a block of predicated instructions.
+// Returns true if it successfully skipped the whole block of predicated
+// instructions. Returns false when it stopped early (due to MaxSteps), or if
+// Iter didn't point to a predicated instruction.
+static bool StepOverPredicatedInstrs(MachineBasicBlock::instr_iterator &Iter,
+ MachineBasicBlock::instr_iterator EndIter,
+ unsigned MaxSteps,
+ unsigned &NumInstrsSteppedOver) {
+ ARMVCC::VPTCodes NextPred = ARMVCC::None;
+ Register PredReg;
+ NumInstrsSteppedOver = 0;
+
+ while (Iter != EndIter) {
+ NextPred = getVPTInstrPredicate(*Iter, PredReg);
+ assert(NextPred != ARMVCC::Else &&
+ "VPT block pass does not expect Else preds");
+ if (NextPred == ARMVCC::None || MaxSteps == 0)
+ break;
+ --MaxSteps;
+ ++Iter;
+ ++NumInstrsSteppedOver;
+  }
+
+ return NumInstrsSteppedOver != 0 &&
+ (NextPred == ARMVCC::None || Iter == EndIter);
+}
+
+// Returns true if at least one instruction in the range [Iter, End) defines
+// or kills VPR.
+static bool IsVPRDefinedOrKilledByBlock(MachineBasicBlock::iterator Iter,
+ MachineBasicBlock::iterator End) {
+ for (; Iter != End; ++Iter)
+ if (Iter->definesRegister(ARM::VPR) || Iter->killsRegister(ARM::VPR))
+ return true;
+ return false;
+}
+
+// Creates a T, TT, TTT or TTTT BlockMask depending on BlockSize.
+static ARM::PredBlockMask GetInitialBlockMask(unsigned BlockSize) {
+ switch (BlockSize) {
+ case 1:
+ return ARM::PredBlockMask::T;
+ case 2:
+ return ARM::PredBlockMask::TT;
+ case 3:
+ return ARM::PredBlockMask::TTT;
+ case 4:
+ return ARM::PredBlockMask::TTTT;
+ default:
+ llvm_unreachable("Invalid BlockSize!");
+ }
+}
+
+// Given an iterator (Iter) that points at an instruction with a "Then"
+// predicate, tries to create the largest block of continuous predicated
+// instructions possible, and returns the VPT Block Mask of that block.
+//
+// This will try to perform some minor optimization in order to maximize the
+// size of the block.
+static ARM::PredBlockMask
+CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter,
+ MachineBasicBlock::instr_iterator EndIter,
+ SmallVectorImpl<MachineInstr *> &DeadInstructions) {
+ MachineBasicBlock::instr_iterator BlockBeg = Iter;
+ (void)BlockBeg;
+ assert(getVPTInstrPredicate(*Iter) == ARMVCC::Then &&
+ "Expected a Predicated Instruction");
+
+ LLVM_DEBUG(dbgs() << "VPT block created for: "; Iter->dump());
+
+ unsigned BlockSize;
+ StepOverPredicatedInstrs(Iter, EndIter, 4, BlockSize);
+
+ LLVM_DEBUG(for (MachineBasicBlock::instr_iterator AddedInstIter =
+ std::next(BlockBeg);
+ AddedInstIter != Iter; ++AddedInstIter) {
+ dbgs() << " adding: ";
+ AddedInstIter->dump();
+ });
+
+ // Generate the initial BlockMask
+ ARM::PredBlockMask BlockMask = GetInitialBlockMask(BlockSize);
+
+ // Remove VPNOTs while there's still room in the block, so we can make the
+ // largest block possible.
+ ARMVCC::VPTCodes CurrentPredicate = ARMVCC::Else;
+ while (BlockSize < 4 && Iter != EndIter &&
+ Iter->getOpcode() == ARM::MVE_VPNOT) {
+
+ // Try to skip all of the predicated instructions after the VPNOT, stopping
+ // after (4 - BlockSize). If we can't skip them all, stop.
+ unsigned ElseInstCnt = 0;
+ MachineBasicBlock::instr_iterator VPNOTBlockEndIter = std::next(Iter);
+ if (!StepOverPredicatedInstrs(VPNOTBlockEndIter, EndIter, (4 - BlockSize),
+ ElseInstCnt))
+ break;
+
+ // Check if this VPNOT can be removed or not: It can only be removed if at
+    // least one of the predicated instructions that follow it kills or sets
+ // VPR.
+ if (!IsVPRDefinedOrKilledByBlock(Iter, VPNOTBlockEndIter))
+ break;
+
+ LLVM_DEBUG(dbgs() << " removing VPNOT: "; Iter->dump(););
+
+ // Record the new size of the block
+ BlockSize += ElseInstCnt;
+ assert(BlockSize <= 4 && "Block is too large!");
+
+ // Record the VPNot to remove it later.
+ DeadInstructions.push_back(&*Iter);
+ ++Iter;
+
+ // Replace the predicates of the instructions we're adding.
+ // Note that we are using "Iter" to iterate over the block so we can update
+ // it at the same time.
+ for (; Iter != VPNOTBlockEndIter; ++Iter) {
+ // Find the register in which the predicate is
+ int OpIdx = findFirstVPTPredOperandIdx(*Iter);
+ assert(OpIdx != -1);
+
+ // Change the predicate and update the mask
+ Iter->getOperand(OpIdx).setImm(CurrentPredicate);
+ BlockMask = expandPredBlockMask(BlockMask, CurrentPredicate);
+
+ LLVM_DEBUG(dbgs() << " adding : "; Iter->dump());
+ }
- return Def;
+ CurrentPredicate =
+ (CurrentPredicate == ARMVCC::Then ? ARMVCC::Else : ARMVCC::Then);
+ }
+ return BlockMask;
}
bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
bool Modified = false;
MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
- SmallSet<MachineInstr *, 4> RemovedVCMPs;
+
+ SmallVector<MachineInstr *, 4> DeadInstructions;
while (MBIter != EndIter) {
MachineInstr *MI = &*MBIter;
- unsigned PredReg = 0;
- DebugLoc dl = MI->getDebugLoc();
+ Register PredReg;
+ DebugLoc DL = MI->getDebugLoc();
ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
// The idea of the predicate is that None, Then and Else are for use when
// handling assembly language: they correspond to the three possible
// suffixes "", "t" and "e" on the mnemonic. So when instructions are read
- // from assembly source or disassembled from object code, you expect to see
- // a mixture whenever there's a long VPT block. But in code generation, we
- // hope we'll never generate an Else as input to this pass.
+ // from assembly source or disassembled from object code, you expect to
+ // see a mixture whenever there's a long VPT block. But in code
+ // generation, we hope we'll never generate an Else as input to this pass.
assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
if (Pred == ARMVCC::None) {
@@ -118,46 +255,25 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
continue;
}
- LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
- int VPTInstCnt = 1;
- ARMVCC::VPTCodes NextPred;
-
- // Look at subsequent instructions, checking if they can be in the same VPT
- // block.
- ++MBIter;
- while (MBIter != EndIter && VPTInstCnt < 4) {
- NextPred = getVPTInstrPredicate(*MBIter, PredReg);
- assert(NextPred != ARMVCC::Else &&
- "VPT block pass does not expect Else preds");
- if (NextPred != Pred)
- break;
- LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump());
- ++VPTInstCnt;
- ++MBIter;
- };
-
- unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt);
+ ARM::PredBlockMask BlockMask =
+ CreateVPTBlock(MBIter, EndIter, DeadInstructions);
- // Search back for a VCMP that can be folded to create a VPT, or else create
- // a VPST directly
+ // Search back for a VCMP that can be folded to create a VPT, or else
+ // create a VPST directly
MachineInstrBuilder MIBuilder;
unsigned NewOpcode;
- MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, RDA, NewOpcode);
- if (VCMP) {
+ LLVM_DEBUG(dbgs() << " final block mask: " << (unsigned)BlockMask << "\n");
+ if (MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode)) {
LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump());
- MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode));
- MIBuilder.addImm(BlockMask);
+ MIBuilder = BuildMI(Block, MI, DL, TII->get(NewOpcode));
+ MIBuilder.addImm((uint64_t)BlockMask);
MIBuilder.add(VCMP->getOperand(1));
MIBuilder.add(VCMP->getOperand(2));
MIBuilder.add(VCMP->getOperand(3));
- // We delay removing the actual VCMP instruction by saving it to a list
- // and deleting all instructions in this list in one go after we have
- // created the VPT blocks. We do this in order not to invalidate the
- // ReachingDefAnalysis that is queried by 'findVCMPToFoldIntoVPST'.
- RemovedVCMPs.insert(VCMP);
+ VCMP->eraseFromParent();
} else {
- MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST));
- MIBuilder.addImm(BlockMask);
+ MIBuilder = BuildMI(Block, MI, DL, TII->get(ARM::MVE_VPST));
+ MIBuilder.addImm((uint64_t)BlockMask);
}
finalizeBundle(
@@ -166,16 +282,18 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
Modified = true;
}
- for (auto *I : RemovedVCMPs)
- I->eraseFromParent();
+ // Erase all dead instructions
+ for (MachineInstr *DeadMI : DeadInstructions) {
+ if (DeadMI->isInsideBundle())
+ DeadMI->eraseFromBundle();
+ else
+ DeadMI->eraseFromParent();
+ }
return Modified;
}
bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
- if (skipFunction(Fn.getFunction()))
- return false;
-
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
@@ -183,7 +301,7 @@ bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
return false;
TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
- RDA = &getAnalysis<ReachingDefAnalysis>();
+ TRI = STI.getRegisterInfo();
LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
<< "********** Function: " << Fn.getName() << '\n');
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
new file mode 100644
index 000000000000..382ddd4572c7
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -0,0 +1,464 @@
+//===-- MVEVPTOptimisationsPass.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass does a few optimisations related to MVE VPT blocks before
+/// register allocation is performed. The goal is to maximize the sizes of the
+/// blocks that will be created by the MVE VPT Block Insertion pass (which runs
+/// after register allocation). The first optimisation done by this pass is the
+/// replacement of "opposite" VCMPs with VPNOTs, so the Block Insertion pass
+/// can delete them later to create larger VPT blocks.
+/// The second optimisation replaces re-uses of old VCCR values with VPNOTs when
+/// inside a block of predicated instructions. This is done to avoid
+/// spill/reloads of VPR in the middle of a block, which prevents the Block
+/// Insertion pass from creating large blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-mve-vpt-opts"
+
+namespace {
+class MVEVPTOptimisations : public MachineFunctionPass {
+public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ MVEVPTOptimisations() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override {
+ return "ARM MVE VPT Optimisation Pass";
+ }
+
+private:
+ MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
+ MachineInstr &Instr,
+ MachineOperand &User,
+ Register Target);
+ bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
+ bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+};
+
+char MVEVPTOptimisations::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTOptimisations, DEBUG_TYPE,
+ "ARM MVE VPT Optimisations pass", false, false)
+
+// Returns true if Opcode is any VCMP Opcode.
+static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }
+
+// Returns true if a VCMP with this Opcode can have its operands swapped.
+// There are 2 kinds of VCMP that can't have their operands swapped: Float VCMPs,
+// and VCMPr instructions (since the r is always on the right).
+static bool CanHaveSwappedOperands(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return true;
+ case ARM::MVE_VCMPf32:
+ case ARM::MVE_VCMPf16:
+ case ARM::MVE_VCMPf32r:
+ case ARM::MVE_VCMPf16r:
+ case ARM::MVE_VCMPi8r:
+ case ARM::MVE_VCMPi16r:
+ case ARM::MVE_VCMPi32r:
+ case ARM::MVE_VCMPu8r:
+ case ARM::MVE_VCMPu16r:
+ case ARM::MVE_VCMPu32r:
+ case ARM::MVE_VCMPs8r:
+ case ARM::MVE_VCMPs16r:
+ case ARM::MVE_VCMPs32r:
+ return false;
+ }
+}
+
+// Returns the CondCode of a VCMP Instruction.
+static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
+ assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
+ return ARMCC::CondCodes(Instr.getOperand(3).getImm());
+}
+
+// Returns true if Cond is equivalent to a VPNOT instruction on the result of
+// Prev. Cond and Prev must be VCMPs.
+static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
+ assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));
+
+ // Opcodes must match.
+ if (Cond.getOpcode() != Prev.getOpcode())
+ return false;
+
+ MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
+ MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);
+
+ // If the VCMP has the opposite condition with the same operands, we can
+ // replace it with a VPNOT
+ ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
+ ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
+ if (ExpectedCode == GetCondCode(Prev))
+ if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
+ return true;
+ // Check again with operands swapped if possible
+ if (!CanHaveSwappedOperands(Cond.getOpcode()))
+ return false;
+ ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
+ return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
+ CondOP2.isIdenticalTo(PrevOP1);
+}
+
+// Returns true if Instr writes to VCCR.
+static bool IsWritingToVCCR(MachineInstr &Instr) {
+ if (Instr.getNumOperands() == 0)
+ return false;
+ MachineOperand &Dst = Instr.getOperand(0);
+ if (!Dst.isReg())
+ return false;
+ Register DstReg = Dst.getReg();
+ if (!DstReg.isVirtual())
+ return false;
+ MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
+ const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
+ return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
+}
+
+// Transforms
+// <Instr that uses %A ('User' Operand)>
+// Into
+// %K = VPNOT %Target
+// <Instr that uses %K ('User' Operand)>
+// And returns the newly inserted VPNOT.
+// This optimization is done in the hopes of preventing spills/reloads of VPR by
+// reducing the number of VCCR values with overlapping lifetimes.
+MachineInstr &MVEVPTOptimisations::ReplaceRegisterUseWithVPNOT(
+ MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
+ Register Target) {
+ Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
+
+ MachineInstrBuilder MIBuilder =
+ BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
+ .addDef(NewResult)
+ .addReg(Target);
+ addUnpredicatedMveVpredNOp(MIBuilder);
+
+ // Make the user use NewResult instead, and clear its kill flag.
+ User.setReg(NewResult);
+ User.setIsKill(false);
+
+ LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
+ MIBuilder.getInstr()->dump());
+
+ return *MIBuilder.getInstr();
+}
+
+// Moves a VPNOT before its first user if an instruction that uses Reg is found
+// in-between the VPNOT and its user.
+// Returns true if there is at least one user of the VPNOT in the block.
+static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Iter,
+ Register Reg) {
+ assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
+ assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
+ "The VPNOT cannot be predicated");
+
+ MachineInstr &VPNOT = *Iter;
+ Register VPNOTResult = VPNOT.getOperand(0).getReg();
+ Register VPNOTOperand = VPNOT.getOperand(1).getReg();
+
+ // Whether the VPNOT will need to be moved, and whether we found a user of the
+ // VPNOT.
+ bool MustMove = false, HasUser = false;
+ MachineOperand *VPNOTOperandKiller = nullptr;
+ for (; Iter != MBB.end(); ++Iter) {
+ if (MachineOperand *MO =
+ Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
+ // If we find the operand that kills the VPNOTOperand's result, save it.
+ VPNOTOperandKiller = MO;
+ }
+
+ if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
+ MustMove = true;
+ continue;
+ }
+
+ if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
+ continue;
+
+ HasUser = true;
+ if (!MustMove)
+ break;
+
+ // Move the VPNOT right before Iter
+ LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: ";
+ Iter->dump());
+ MBB.splice(Iter, &MBB, VPNOT.getIterator());
+ // If we move the instr, and its operand was killed earlier, remove the kill
+ // flag.
+ if (VPNOTOperandKiller)
+ VPNOTOperandKiller->setIsKill(false);
+
+ break;
+ }
+ return HasUser;
+}
+
+// This optimisation attempts to reduce the number of overlapping lifetimes of
+// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
+// this replaces
+// %A:vccr = (something)
+// %B:vccr = VPNOT %A
+// %Foo = (some op that uses %B)
+// %Bar = (some op that uses %A)
+// With
+// %A:vccr = (something)
+// %B:vccr = VPNOT %A
+// %Foo = (some op that uses %B)
+// %TMP2:vccr = VPNOT %B
+// %Bar = (some op that uses %A)
+bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
+ SmallVector<MachineInstr *, 4> DeadInstructions;
+ bool Modified = false;
+
+ while (Iter != End) {
+ Register VCCRValue, OppositeVCCRValue;
+ // The first loop looks for 2 unpredicated instructions:
+ // %A:vccr = (instr) ; A is stored in VCCRValue
+ // %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue
+ for (; Iter != End; ++Iter) {
+ // We're only interested in unpredicated instructions that write to VCCR.
+ if (!IsWritingToVCCR(*Iter) ||
+ getVPTInstrPredicate(*Iter) != ARMVCC::None)
+ continue;
+ Register Dst = Iter->getOperand(0).getReg();
+
+ // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
+ // found what we were looking for.
+ if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
+ Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
+ // Move the VPNOT closer to its first user if needed, and ignore if it
+ // has no users.
+ if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
+ continue;
+
+ OppositeVCCRValue = Dst;
+ ++Iter;
+ break;
+ }
+
+ // Else, just set VCCRValue.
+ VCCRValue = Dst;
+ }
+
+ // If the first inner loop didn't find anything, stop here.
+ if (Iter == End)
+ break;
+
+ assert(VCCRValue && OppositeVCCRValue &&
+ "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
+ "stopped before the end of the block!");
+ assert(VCCRValue != OppositeVCCRValue &&
+ "VCCRValue should not be equal to OppositeVCCRValue!");
+
+ // LastVPNOTResult always contains the same value as OppositeVCCRValue.
+ Register LastVPNOTResult = OppositeVCCRValue;
+
+ // This second loop tries to optimize the remaining instructions.
+ for (; Iter != End; ++Iter) {
+ bool IsInteresting = false;
+
+ if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
+ IsInteresting = true;
+
+ // - If the instruction is a VPNOT, it can be removed, and we can just
+ // replace its uses with LastVPNOTResult.
+ // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
+ if (Iter->getOpcode() == ARM::MVE_VPNOT) {
+ Register Result = Iter->getOperand(0).getReg();
+
+ MRI->replaceRegWith(Result, LastVPNOTResult);
+ DeadInstructions.push_back(&*Iter);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs()
+ << "Replacing all uses of '" << printReg(Result)
+ << "' with '" << printReg(LastVPNOTResult) << "'\n");
+ } else {
+ MachineInstr &VPNOT =
+ ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
+ Modified = true;
+
+ LastVPNOTResult = VPNOT.getOperand(0).getReg();
+ std::swap(VCCRValue, OppositeVCCRValue);
+
+ LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
+ << "' with '" << printReg(LastVPNOTResult)
+ << "' in instr: " << *Iter);
+ }
+ } else {
+ // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
+ // instead as they contain the same value.
+ if (MachineOperand *MO =
+ Iter->findRegisterUseOperand(OppositeVCCRValue)) {
+ IsInteresting = true;
+
+ // This is pointless if LastVPNOTResult == OppositeVCCRValue.
+ if (LastVPNOTResult != OppositeVCCRValue) {
+ LLVM_DEBUG(dbgs() << "Replacing usage of '"
+ << printReg(OppositeVCCRValue) << "' with '"
+ << printReg(LastVPNOTResult) << "' for instr: ";
+ Iter->dump());
+ MO->setReg(LastVPNOTResult);
+ Modified = true;
+ }
+
+ MO->setIsKill(false);
+ }
+
+ // If this is an unpredicated VPNOT on
+ // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
+ if (Iter->getOpcode() == ARM::MVE_VPNOT &&
+ getVPTInstrPredicate(*Iter) == ARMVCC::None) {
+ Register VPNOTOperand = Iter->getOperand(1).getReg();
+ if (VPNOTOperand == LastVPNOTResult ||
+ VPNOTOperand == OppositeVCCRValue) {
+ IsInteresting = true;
+
+ std::swap(VCCRValue, OppositeVCCRValue);
+ LastVPNOTResult = Iter->getOperand(0).getReg();
+ }
+ }
+ }
+
+ // If this instruction was not interesting, and it writes to VCCR, stop.
+ if (!IsInteresting && IsWritingToVCCR(*Iter))
+ break;
+ }
+ }
+
+ for (MachineInstr *DeadInstruction : DeadInstructions)
+ DeadInstruction->removeFromParent();
+
+ return Modified;
+}
+
+// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
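+// A minimal illustrative sketch (register names and operands invented, not
+// taken from the source): given two adjacent unpredicated VCMPs on the same
+// operands with opposite condition codes,
+// %A:vccr = MVE_VCMPs32 %q0, %q1, (ge)
+// %B:vccr = MVE_VCMPs32 %q0, %q1, (lt)
+// the second compare can be rewritten as
+// %B:vccr = MVE_VPNOT %A
+// because inverting the predicate in %A produces the same lanes as computing
+// the opposite comparison directly.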
+bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
+ SmallVector<MachineInstr *, 4> DeadInstructions;
+
+ // The last VCMP that we have seen and that couldn't be replaced.
+ // This is reset when an instruction that writes to VCCR/VPR is found, or when
+ // a VCMP is replaced with a VPNOT.
+ // We'll only replace VCMPs with VPNOTs when this is not null, and when the
+ // current VCMP is the opposite of PrevVCMP.
+ MachineInstr *PrevVCMP = nullptr;
+ // If we find an instruction that kills the result of PrevVCMP, we save the
+ // operand here to remove the kill flag in case we need to use PrevVCMP's
+ // result.
+ MachineOperand *PrevVCMPResultKiller = nullptr;
+
+ for (MachineInstr &Instr : MBB.instrs()) {
+ if (PrevVCMP) {
+ if (MachineOperand *MO = Instr.findRegisterUseOperand(
+ PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
+ // If we come across the instr that kills PrevVCMP's result, record it
+ // so we can remove the kill flag later if we need to.
+ PrevVCMPResultKiller = MO;
+ }
+ }
+
+ // Ignore predicated instructions.
+ if (getVPTInstrPredicate(Instr) != ARMVCC::None)
+ continue;
+
+ // Only look at VCMPs
+ if (!IsVCMP(Instr.getOpcode())) {
+ // If the instruction writes to VCCR, forget the previous VCMP.
+ if (IsWritingToVCCR(Instr))
+ PrevVCMP = nullptr;
+ continue;
+ }
+
+ if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
+ PrevVCMP = &Instr;
+ continue;
+ }
+
+ // The register containing the result of the VCMP that we're going to
+ // replace.
+ Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();
+
+ // Build a VPNOT to replace the VCMP, reusing its operands.
+ MachineInstrBuilder MIBuilder =
+ BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
+ .add(Instr.getOperand(0))
+ .addReg(PrevVCMPResultReg);
+ addUnpredicatedMveVpredNOp(MIBuilder);
+ LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
+ MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: ";
+ Instr.dump());
+
+ // If we found an instruction that uses and kills PrevVCMP's result,
+ // remove the kill flag.
+ if (PrevVCMPResultKiller)
+ PrevVCMPResultKiller->setIsKill(false);
+
+ // Finally, mark the old VCMP for removal and reset
+ // PrevVCMP/PrevVCMPResultKiller.
+ DeadInstructions.push_back(&Instr);
+ PrevVCMP = nullptr;
+ PrevVCMPResultKiller = nullptr;
+ }
+
+ for (MachineInstr *DeadInstruction : DeadInstructions)
+ DeadInstruction->removeFromParent();
+
+ return !DeadInstructions.empty();
+}
+
+bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
+ const ARMSubtarget &STI =
+ static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+
+ if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+ return false;
+
+ TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+ MRI = &Fn.getRegInfo();
+
+ LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
+ << "********** Function: " << Fn.getName() << '\n');
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : Fn) {
+ Modified |= ReplaceVCMPsByVPNOTs(MBB);
+ Modified |= ReduceOldVCCRValueUses(MBB);
+ }
+
+ LLVM_DEBUG(dbgs() << "**************************************\n");
+ return Modified;
+}
+
+/// createMVEVPTOptimisationsPass
+FunctionPass *llvm::createMVEVPTOptimisationsPass() {
+ return new MVEVPTOptimisations();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 956d474f1d79..d568e9afe432 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -88,8 +88,10 @@ emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
0, MIFlags);
}
BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP)
- .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill)
- .add(predOps(ARMCC::AL));
+ .addReg(ARM::SP)
+ .addReg(ScratchReg, RegState::Kill)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
return;
}
// FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate
@@ -127,7 +129,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- Amount = alignTo(Amount, getStackAlignment());
+ Amount = alignTo(Amount, getStackAlign());
// Replace the pseudo instruction with a new instruction...
unsigned Opc = Old.getOpcode();
@@ -180,9 +182,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (ArgRegsSaveSize) {
emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
ARM::NoRegister, MachineInstr::FrameSetup);
- CFAOffset -= ArgRegsSaveSize;
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ CFAOffset += ArgRegsSaveSize;
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -193,9 +195,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
-(NumBytes - ArgRegsSaveSize),
ARM::NoRegister, MachineInstr::FrameSetup);
- CFAOffset -= NumBytes - ArgRegsSaveSize;
+ CFAOffset += NumBytes - ArgRegsSaveSize;
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -257,9 +259,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
}
if (adjustedGPRCS1Size) {
- CFAOffset -= adjustedGPRCS1Size;
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ CFAOffset += adjustedGPRCS1Size;
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -305,8 +307,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup)
.add(predOps(ARMCC::AL));
if(FramePtrOffsetInBlock) {
- CFAOffset += FramePtrOffsetInBlock;
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ CFAOffset -= FramePtrOffsetInBlock;
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
@@ -384,9 +386,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
ScratchRegister, MachineInstr::FrameSetup);
if (!HasFP) {
- CFAOffset -= NumBytes;
+ CFAOffset += NumBytes;
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -402,7 +404,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
if (RegInfo->needsStackRealignment(MF)) {
- const unsigned NrBitsToZero = countTrailingZeros(MFI.getMaxAlignment());
+ const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
// Emit the following sequence, using R4 as a temporary, since we cannot use
// SP as a source or destination register for the shifts:
// mov r4, sp
@@ -804,11 +806,9 @@ static const unsigned *findNextOrderedReg(const unsigned *CurrentReg,
return CurrentReg;
}
-bool Thumb1FrameLowering::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool Thumb1FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -927,11 +927,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool Thumb1FrameLowering::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -1049,6 +1047,10 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (!STI.hasV5TOps())
continue;
+ // CMSE entry functions must return via BXNS, see emitEpilogue.
+ if (AFI->isCmseNSEntryFunction())
+ continue;
+
// Pop LR into PC.
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h
index 61af48712b6c..a4b2a085ea38 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -27,12 +27,13 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index b08b71a4952d..79afa378cb62 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -76,7 +76,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void Thumb1InstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
assert((RC == &ARM::tGPRRegClass ||
@@ -92,7 +92,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
BuildMI(MBB, I, DL, get(ARM::tSTRspi))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
@@ -104,7 +104,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void Thumb1InstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
assert(
@@ -121,7 +121,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
.addFrameIndex(FI)
.addImm(0)
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 530289fe8c5d..017b7222337c 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -42,13 +42,13 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 786fc78d0233..5cdaa7f02201 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -183,7 +183,7 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI,
++I;
if (I != E) {
- unsigned NPredReg = 0;
+ Register NPredReg;
ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg);
if (NCC == CC || NCC == OCC)
return true;
@@ -199,7 +199,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) {
while (MBBI != E) {
MachineInstr *MI = &*MBBI;
DebugLoc dl = MI->getDebugLoc();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC == ARMCC::AL) {
++MBBI;
@@ -239,7 +239,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) {
MachineInstr *NMI = &*MBBI;
MI = NMI;
- unsigned NPredReg = 0;
+ Register NPredReg;
ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
if (NCC == CC || NCC == OCC) {
Mask |= ((NCC ^ CC) & 1) << Pos;
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index e06bb9546c03..48c6b47f2154 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -66,7 +66,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
// If the first instruction of Tail is predicated, we may have to update
// the IT instruction.
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg);
MachineBasicBlock::iterator MBBI = Tail;
if (CC != ARMCC::AL)
@@ -114,7 +114,7 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
return false;
}
- unsigned PredReg = 0;
+ Register PredReg;
return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
}
@@ -133,7 +133,7 @@ void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void Thumb2InstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -143,7 +143,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::t2STRi12))
@@ -176,14 +176,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void Thumb2InstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -229,9 +229,9 @@ void Thumb2InstrInfo::expandLoadStackGuard(
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII,
unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
@@ -471,7 +471,7 @@ immediateOffsetOpcode(unsigned opcode)
}
bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII,
const TargetRegisterInfo *TRI) {
unsigned Opcode = MI.getOpcode();
@@ -491,7 +491,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
if (IsSP || Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
Offset += MI.getOperand(FrameRegIdx+1).getImm();
- unsigned PredReg;
+ Register PredReg;
if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL &&
!MI.definesRegister(ARM::CPSR)) {
// Turn it into a move.
@@ -634,7 +634,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
assert((Offset & OffsetMask) == 0 && "Can't encode this offset!");
(void)OffsetMask; // squash unused-variable warning at -NDEBUG
} else if (AddrMode == ARMII::AddrModeT2_i8s4) {
- Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+ Offset += MI.getOperand(FrameRegIdx + 1).getImm();
NumBits = 8 + 2;
// MCInst operand expects already scaled value.
Scale = 1;
@@ -706,7 +706,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
}
ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg) {
+ Register &PredReg) {
unsigned Opc = MI.getOpcode();
if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
return ARMCC::AL;
@@ -727,7 +727,7 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
}
ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg) {
+ Register &PredReg) {
int PIdx = findFirstVPTPredOperandIdx(MI);
if (PIdx == -1) {
PredReg = 0;
@@ -737,3 +737,33 @@ ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
PredReg = MI.getOperand(PIdx+1).getReg();
return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm();
}
+
+void llvm::recomputeVPTBlockMask(MachineInstr &Instr) {
+ assert(isVPTOpcode(Instr.getOpcode()) && "Not a VPST or VPT Instruction!");
+
+ MachineOperand &MaskOp = Instr.getOperand(0);
+ assert(MaskOp.isImm() && "Operand 0 is not the block mask of the VPT/VPST?!");
+
+ MachineBasicBlock::iterator Iter = ++Instr.getIterator(),
+ End = Instr.getParent()->end();
+
+ // Verify that the instruction after the VPT/VPST is predicated (it should
+ // be), and skip it.
+ assert(
+ getVPTInstrPredicate(*Iter) == ARMVCC::Then &&
+ "VPT/VPST should be followed by an instruction with a 'then' predicate!");
+ ++Iter;
+
+ // Iterate over the predicated instructions, updating the BlockMask as we go.
+ ARM::PredBlockMask BlockMask = ARM::PredBlockMask::T;
+ while (Iter != End) {
+ ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*Iter);
+ if (Pred == ARMVCC::None)
+ break;
+ BlockMask = expandPredBlockMask(BlockMask, Pred);
+ ++Iter;
+ }
+
+ // Rewrite the BlockMask.
+ MaskOp.setImm((int64_t)(BlockMask));
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 7d8dff14e1e7..ec3763632239 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -44,13 +44,13 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -67,13 +67,24 @@ private:
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
/// to llvm::getInstrPredicate except it returns AL for conditional branch
/// instructions which are "predicated", but are not in IT blocks.
-ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, Register &PredReg);
// getVPTInstrPredicate: VPT analogue of that, plus a helper function
// corresponding to MachineInstr::findFirstPredOperandIdx.
int findFirstVPTPredOperandIdx(const MachineInstr &MI);
ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg);
+ Register &PredReg);
+inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) {
+ Register PredReg;
+ return getVPTInstrPredicate(MI, PredReg);
}
+// Recomputes the Block Mask of Instr, a VPT or VPST instruction.
+// This rebuilds the block mask of the instruction depending on the predicates
+// of the instructions following it. This should only be used after the
+// MVEVPTBlockInsertion pass has run, and should be used whenever a predicated
+// instruction is added to/removed from the block.
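+// Hypothetical usage sketch (the surrounding names are invented for
+// illustration): after erasing a predicated instruction from a finished VPT
+// block headed by VPTMI, refresh the mask with:
+//   PredicatedMI->eraseFromParent();
+//   llvm::recomputeVPTBlockMask(VPTMI);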
+void recomputeVPTBlockMask(MachineInstr &Instr);
+} // namespace llvm
+
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index c5a62aa33990..ae661594bdc9 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -457,7 +457,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
return false;
if (!MI->hasOneMemOperand() ||
- (*MI->memoperands_begin())->getAlignment() < 4)
+ (*MI->memoperands_begin())->getAlign() < Align(4))
return false;
// We're creating a completely different type of load/store - LDM from LDR.
@@ -516,13 +516,23 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
isLdStMul = true;
break;
}
- case ARM::t2STMIA:
- // If the base register is killed, we don't care what its value is after the
- // instruction, so we can use an updating STMIA.
+ case ARM::t2STMIA: {
+ // t2STMIA is reduced to tSTMIA_UPD which has writeback. We can only do this
+ // if the base register is killed, as then it doesn't matter what its value
+ // is after the instruction.
if (!MI->getOperand(0).isKill())
return false;
+ // If the base register is in the register list and isn't the lowest
+ // numbered register (i.e. it's in operand 4 onwards) then with writeback
+ // the stored value is unknown, so we can't convert to tSTMIA_UPD.
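+ // For example (illustrative only): even if r1 is killed,
+ //   t2STMIA killed r1, {r0, r1}
+ // cannot become
+ //   tSTMIA_UPD r1!, {r0, r1}
+ // because with writeback the value stored for r1, which is not the lowest
+ // register in the list, is unpredictable.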
+ Register BaseReg = MI->getOperand(0).getReg();
+ for (unsigned i = 4; i < MI->getNumOperands(); ++i)
+ if (MI->getOperand(i).getReg() == BaseReg)
+ return false;
+
break;
+ }
case ARM::t2LDMIA_RET: {
Register BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
@@ -676,7 +686,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
default: break;
case ARM::t2ADDSri:
case ARM::t2ADDSrr: {
- unsigned PredReg = 0;
+ Register PredReg;
if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
switch (Opc) {
default: break;
@@ -718,7 +728,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
case ARM::t2TEQrr: {
- unsigned PredReg = 0;
+ Register PredReg;
// Can only convert to eors if we're not in an IT block.
if (getInstrPredicate(*MI, PredReg) != ARMCC::AL)
break;
@@ -789,7 +799,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
@@ -882,7 +892,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index b0ba58d8dc4a..4da6f6ab6994 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -70,7 +70,7 @@ static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
.addReg(DestReg, getDefRegState(true), SubIdx)
@@ -89,7 +89,7 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
.addReg(DestReg, getDefRegState(true), SubIdx)
@@ -102,14 +102,13 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
/// specified immediate.
void ThumbRegisterInfo::emitLoadConstPool(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, Register PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (STI.isThumb1Only()) {
- assert(
- (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) &&
- "Thumb1 does not have ldr to high register");
+ assert((isARMLowRegister(DestReg) || DestReg.isVirtual()) &&
+ "Thumb1 does not have ldr to high register");
return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
PredReg, MIFlags);
}
@@ -123,7 +122,7 @@ void ThumbRegisterInfo::emitLoadConstPool(
/// constpool entry.
static void emitThumbRegPlusImmInReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg, Register BaseReg, int NumBytes,
bool CanChangeCC, const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
MachineFunction &MF = *MBB.getParent();
@@ -139,7 +138,7 @@ static void emitThumbRegPlusImmInReg(
isSub = true;
NumBytes = -NumBytes;
}
- unsigned LdReg = DestReg;
+ Register LdReg = DestReg;
if (DestReg == ARM::SP)
assert(BaseReg == ARM::SP && "Unexpected!");
if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg))
@@ -185,8 +184,8 @@ static void emitThumbRegPlusImmInReg(
/// be too long. This is allowed to modify the condition flags.
void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI,
unsigned MIFlags) {
@@ -358,7 +357,7 @@ static unsigned convertToNonSPOpcode(unsigned Opcode) {
bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) const {
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
@@ -427,8 +426,8 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
return Offset == 0;
}
-void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
- int64_t Offset) const {
+void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
+ int64_t Offset) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (!STI.isThumb1Only())
@@ -458,12 +457,12 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum,
RS);
- unsigned VReg = 0;
+ Register VReg;
const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB(*MBB.getParent(), &MI);
- unsigned FrameReg;
+ Register FrameReg;
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
const ARMFrameLowering *TFI = getFrameLowering(MF);
int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h
index 08cf67284d4c..e05a24dbaca5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h
@@ -38,18 +38,18 @@ public:
/// specified immediate.
void
emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx,
int Val, ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
+ Register PredReg = Register(),
unsigned MIFlags = MachineInstr::NoFlags) const override;
// rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
// however much remains to be handled. Return 'true' if no further
// work is required.
bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) const;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
index 4ace61cccd0f..3356d56481e5 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
@@ -15,6 +15,37 @@
using namespace llvm;
namespace llvm {
+ARM::PredBlockMask expandPredBlockMask(ARM::PredBlockMask BlockMask,
+ ARMVCC::VPTCodes Kind) {
+ using PredBlockMask = ARM::PredBlockMask;
+ assert(Kind != ARMVCC::None && "Cannot expand a mask with None!");
+ assert(countTrailingZeros((unsigned)BlockMask) != 0 &&
+ "Mask is already full");
+
+ auto ChooseMask = [&](PredBlockMask AddedThen, PredBlockMask AddedElse) {
+ return Kind == ARMVCC::Then ? AddedThen : AddedElse;
+ };
+
+ switch (BlockMask) {
+ case PredBlockMask::T:
+ return ChooseMask(PredBlockMask::TT, PredBlockMask::TE);
+ case PredBlockMask::TT:
+ return ChooseMask(PredBlockMask::TTT, PredBlockMask::TTE);
+ case PredBlockMask::TE:
+ return ChooseMask(PredBlockMask::TET, PredBlockMask::TEE);
+ case PredBlockMask::TTT:
+ return ChooseMask(PredBlockMask::TTTT, PredBlockMask::TTTE);
+ case PredBlockMask::TTE:
+ return ChooseMask(PredBlockMask::TTET, PredBlockMask::TTEE);
+ case PredBlockMask::TET:
+ return ChooseMask(PredBlockMask::TETT, PredBlockMask::TETE);
+ case PredBlockMask::TEE:
+ return ChooseMask(PredBlockMask::TEET, PredBlockMask::TEEE);
+ default:
+ llvm_unreachable("Unknown Mask");
+ }
+}
+
namespace ARMSysReg {
// lookup system register using 12-bit SYSm value.
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
index 27605422983d..80b7276adb4e 100644
--- a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -91,41 +91,41 @@ namespace ARMVCC {
Then,
Else
};
-
- enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
+} // namespace ARMVCC
+
+namespace ARM {
+ /// Mask values for IT and VPT Blocks, to be used by MCOperands.
+ /// Note that this is different from the "real" encoding used by the
+ /// instructions. In this encoding, the lowest set bit indicates the end of
+ /// the encoding, and above that, "1" indicates an else, while "0" indicates
+ /// a then.
+ /// Tx = x100
+ /// Txy = xy10
+ /// Txyz = xyz1
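+ /// For example, working through this scheme: in TTE the second predicate is
+ /// a "then" (0) and the third an "else" (1), followed by the terminating set
+ /// bit and padding ("10"), giving TTE = 0b0110.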
+ enum class PredBlockMask {
+ T = 0b1000,
+ TT = 0b0100,
+ TE = 0b1100,
+ TTT = 0b0010,
+ TTE = 0b0110,
+ TEE = 0b1110,
+ TET = 0b1010,
+ TTTT = 0b0001,
+ TTTE = 0b0011,
+ TTEE = 0b0111,
+ TTET = 0b0101,
+ TEEE = 0b1111,
+ TEET = 0b1101,
+ TETT = 0b1001,
+ TETE = 0b1011
};
-}
+} // namespace ARM
-inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
- switch (NumInsts) {
- case 1:
- return ARMVCC::T;
- case 2:
- return ARMVCC::TT;
- case 3:
- return ARMVCC::TTT;
- case 4:
- return ARMVCC::TTTT;
- default:
- break;
- };
- llvm_unreachable("Unexpected number of instruction in a VPT block");
-}
+// Expands a PredBlockMask by adding an E or a T at the end, depending on Kind.
+// e.g. expandPredBlockMask(T, Then) = TT, expandPredBlockMask(TT, Else) = TTE,
+// and so on.
+ARM::PredBlockMask expandPredBlockMask(ARM::PredBlockMask BlockMask,
+ ARMVCC::VPTCodes Kind);
inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {
switch (CC) {