author:    Dimitry Andric <dim@FreeBSD.org>  2020-07-31 21:22:58 +0000
committer: Dimitry Andric <dim@FreeBSD.org>  2020-07-31 21:22:58 +0000
commit:    5ffd83dbcc34f10e07f6d3e968ae6365869615f4 (patch)
tree:      0e9f5cf729dde39f949698fddef45a34e2bc7f44 /contrib/llvm-project/llvm/lib/Target/ARM
parent:    1799696096df87b52968b8996d00c91e0a5de8d9 (diff)
parent:    cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff)
Merge llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and openmp
master 2e10b7a39b9, the last commit before the llvmorg-12-init tag, from
which release/11.x was branched.
Note that for now, I rolled back all our local changes to make merging
easier, and I will reapply the still-relevant ones after updating to
11.0.0-rc1.
Notes:
svn path=/projects/clang1100-import/; revision=363742
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/ARM')
88 files changed, 12102 insertions, 3442 deletions
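
The bulk of the diff below is mechanical: LLVM 11 renamed the AsmPrinter entry points from Emit* to emit* (EmitInstruction becomes emitInstruction, EmitFunctionBodyEnd becomes emitFunctionBodyEnd, and so on), so every ARM override follows suit. As a minimal sketch with toy classes (not the real LLVM hierarchy), the override keyword is what turns a missed rename into a compile error instead of a silently dead virtual:

    // Toy classes, not the real LLVM hierarchy: 'override' turns a missed
    // rename into a compile error instead of a silently dead virtual.
    struct AsmPrinterBase {
      virtual void emitInstruction() {} // renamed from EmitInstruction()
      virtual ~AsmPrinterBase() = default;
    };

    struct MyAsmPrinter : AsmPrinterBase {
      // void EmitInstruction() override {} // error: does not override
      void emitInstruction() override {}    // tracks the rename
    };
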
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h index 3412813a3ef2..7398968bb24a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.h @@ -47,6 +47,7 @@ FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); FunctionPass *createMVEVPTBlockPass(); +FunctionPass *createMVEVPTOptimisationsPass(); FunctionPass *createARMOptimizeBarriersPass(); FunctionPass *createThumb2SizeReductionPass( std::function<bool(const Function &)> Ftor = nullptr); @@ -66,6 +67,7 @@ void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); +void initializeMVEVPTOptimisationsPass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); void initializeMVETailPredicationPass(PassRegistry &); void initializeMVEGatherScatterLoweringPass(PassRegistry &); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td index 380eaa863689..0468f7f1cf8e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARM.td @@ -424,6 +424,13 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler", def FeatureSB : SubtargetFeature<"sb", "HasSB", "true", "Enable v8.5a Speculation Barrier" >; +// Armv8.6-A extensions +def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true", + "Enable support for BFloat16 instructions", [FeatureNEON]>; + +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension", [FeatureNEON]>; + // Armv8.1-M extensions def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true", @@ -523,6 +530,11 @@ def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions", [HasV8_4aOps, FeatureSB]>; +def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true", + "Support ARM v8.6a instructions", + [HasV8_5aOps, FeatureBF16, + FeatureMatMulInt8]>; + def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", "Support ARM v8-1M Mainline instructions", @@ -536,6 +548,16 @@ def HasMVEFloatOps : SubtargetFeature< "Support M-Class Vector Extension with integer and floating ops", [HasMVEIntegerOps, FeatureFPARMv8_D16_SP, FeatureFullFP16]>; +def HasCDEOps : SubtargetFeature<"cde", "HasCDEOps", "true", + "Support CDE instructions", + [HasV8MMainlineOps]>; + +foreach i = {0-7} in + def FeatureCoprocCDE#i : SubtargetFeature<"cdecp"#i, + "CoprocCDE["#i#"]", "true", + "Coprocessor "#i#" ISA is CDEv1", + [HasCDEOps]>; + //===----------------------------------------------------------------------===// // ARM Processor subtarget features. 
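
The HasV8_6aOps definition just above follows the usual layering: each architecture level implies the previous level plus its new extensions, here v8.5a plus BF16 and the Int8 matrix-multiply feature. A conceptual sketch of that union in C++; the bit positions are arbitrary stand-ins, and TableGen actually expresses this through the implied-features list rather than a literal bitmask:

    #include <cstdint>

    // Bit positions are arbitrary stand-ins; TableGen models this via the
    // implied SubtargetFeature list, not a hand-written bitmask.
    enum : uint64_t {
      FeatureBF16       = 1u << 0,
      FeatureMatMulInt8 = 1u << 1,
      HasV8_5aOps       = 1u << 2, // itself implies v8.4a and below
    };

    constexpr uint64_t HasV8_6aOps =
        HasV8_5aOps | FeatureBF16 | FeatureMatMulInt8;
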
// @@ -572,6 +594,12 @@ def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", []>; def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", []>; +def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", + "Cortex-A77 ARM processors", []>; +def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78", + "Cortex-A78 ARM processors", []>; +def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1", + "Cortex-X1 ARM processors", []>; def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", "Qualcomm Krait processors", []>; @@ -787,6 +815,19 @@ def ARMv85a : Architecture<"armv8.5-a", "ARMv85a", [HasV8_5aOps, FeatureCRC, FeatureRAS, FeatureDotProd]>; +def ARMv86a : Architecture<"armv8.6-a", "ARMv86a", [HasV8_6aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, @@ -1114,6 +1155,14 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline, FeatureUseMISched, FeatureHasNoBranchPredictor]>; +def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline, + FeatureDSP, + FeatureFPARMv8_D16, + FeatureUseMISched, + FeatureHasNoBranchPredictor, + FeaturePrefLoopAlign32, + FeatureHasSlowFPVMLx, + HasMVEFloatOps]>; def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, @@ -1181,6 +1230,30 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76, FeatureFullFP16, FeatureDotProd]>; +def : ProcNoItin<"cortex-a77", [ARMv82a, ProcA77, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureDotProd]>; + +def : ProcNoItin<"cortex-a78", [ARMv82a, ProcA78, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureDotProd]>; + +def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFullFP16, + FeatureDotProd]>; + def : ProcNoItin<"neoverse-n1", [ARMv82a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 6f26ca127f94..d6c1efa6327c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -57,26 +57,36 @@ ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), AFI(nullptr), MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {} -void ARMAsmPrinter::EmitFunctionBodyEnd() { +void ARMAsmPrinter::emitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end // of the function. 
if (!InConstantPool) return; InConstantPool = false; - OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); + OutStreamer->emitDataRegion(MCDR_DataRegionEnd); } -void ARMAsmPrinter::EmitFunctionEntryLabel() { +void ARMAsmPrinter::emitFunctionEntryLabel() { if (AFI->isThumbFunction()) { - OutStreamer->EmitAssemblerFlag(MCAF_Code16); - OutStreamer->EmitThumbFunc(CurrentFnSym); + OutStreamer->emitAssemblerFlag(MCAF_Code16); + OutStreamer->emitThumbFunc(CurrentFnSym); } else { - OutStreamer->EmitAssemblerFlag(MCAF_Code32); + OutStreamer->emitAssemblerFlag(MCAF_Code32); } - OutStreamer->EmitLabel(CurrentFnSym); + + // Emit symbol for CMSE non-secure entry point + if (AFI->isCmseNSEntryFunction()) { + MCSymbol *S = + OutContext.getOrCreateSymbol("__acle_se_" + CurrentFnSym->getName()); + emitLinkage(&MF->getFunction(), S); + OutStreamer->emitSymbolAttribute(S, MCSA_ELF_TypeFunction); + OutStreamer->emitLabel(S); + } + + OutStreamer->emitLabel(CurrentFnSym); } -void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) { +void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) { uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType()); assert(Size && "C++ constructor pointer had zero size!"); @@ -90,17 +100,17 @@ void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) { : MCSymbolRefExpr::VK_None), OutContext); - OutStreamer->EmitValue(E, Size); + OutStreamer->emitValue(E, Size); } -void ARMAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { +void ARMAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { if (PromotedGlobals.count(GV)) // The global was promoted into a constant pool. It should not be emitted. return; - AsmPrinter::EmitGlobalVariable(GV); + AsmPrinter::emitGlobalVariable(GV); } -/// runOnMachineFunction - This uses the EmitInstruction() +/// runOnMachineFunction - This uses the emitInstruction() /// method to print assembly for each instruction. /// bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -158,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } // Emit the rest of the function body. - EmitFunctionBody(); + emitFunctionBody(); // Emit the XRay table for this function. emitXRayTable(); @@ -167,10 +177,10 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { // These are created per function, rather than per TU, since it's // relatively easy to exceed the thumb branch range within a TU. if (! ThumbIndirectPads.empty()) { - OutStreamer->EmitAssemblerFlag(MCAF_Code16); - EmitAlignment(Align(2)); + OutStreamer->emitAssemblerFlag(MCAF_Code16); + emitAlignment(Align(2)); for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) { - OutStreamer->EmitLabel(TIP.second); + OutStreamer->emitLabel(TIP.second); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX) .addReg(TIP.first) // Add predicate operands. @@ -467,14 +477,14 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, // the start mode, then restore the start mode. const bool WasThumb = isThumb(StartInfo); if (!EndInfo || WasThumb != isThumb(*EndInfo)) { - OutStreamer->EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32); + OutStreamer->emitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32); } } -void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { +void ARMAsmPrinter::emitStartOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); // Use unified assembler syntax. 
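
Among the renames, the hunk above adds real functionality: a CMSE non-secure entry function gets a second symbol following the ACLE convention, "__acle_se_" prepended to the function name, emitted alongside the function's entry label. A tiny sketch of just the naming rule; the helper name is invented for illustration:

    #include <iostream>
    #include <string>

    // Naming rule only, with an invented helper: a CMSE non-secure entry
    // function <name> gets a paired symbol "__acle_se_<name>".
    std::string cmseEntrySymbol(const std::string &FnName) {
      return "__acle_se_" + FnName;
    }

    int main() {
      std::cout << cmseEntrySymbol("secure_fn") << "\n"; // __acle_se_secure_fn
    }
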
- OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified); + OutStreamer->emitAssemblerFlag(MCAF_SyntaxUnified); // Emit ARM Build Attributes if (TT.isOSBinFormatELF()) @@ -484,20 +494,20 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { // if we're thumb for the purposes of the top level code16 assembler // flag. if (!M.getModuleInlineAsm().empty() && TT.isThumb()) - OutStreamer->EmitAssemblerFlag(MCAF_Code16); + OutStreamer->emitAssemblerFlag(MCAF_Code16); } static void emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, MachineModuleInfoImpl::StubValueTy &MCSym) { // L_foo$stub: - OutStreamer.EmitLabel(StubLabel); + OutStreamer.emitLabel(StubLabel); // .indirect_symbol _foo - OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); if (MCSym.getInt()) // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/); + OutStreamer.emitIntValue(0, 4/*size*/); else // Internal to current translation unit. // @@ -505,13 +515,13 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, // pointers need to be indirect and pc-rel. We accomplish this by // using NLPs; however, sometimes the types are local to the file. // We need to fill in the value for the NLP in those cases. - OutStreamer.EmitValue( + OutStreamer.emitValue( MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()), 4 /*size*/); } -void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { +void ARMAsmPrinter::emitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) { // All darwin targets use mach-o. @@ -526,7 +536,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); - EmitAlignment(Align(4)); + emitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -539,7 +549,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection()); - EmitAlignment(Align(4)); + emitAlignment(Align(4)); for (auto &Stub : Stubs) emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second); @@ -553,7 +563,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // implementation of multiple entry points). If this doesn't occur, the // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. - OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols); } // The last attribute to be emitted is ABI_optimization_goals @@ -570,18 +580,28 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { } //===----------------------------------------------------------------------===// -// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() +// Helper routines for emitStartOfAsmFile() and emitEndOfAsmFile() // FIXME: // The following seem like one-off assembler flags, but they actually need // to appear in the .ARM.attributes section in ELF. // Instead of subclassing the MCELFStreamer, we do the work here. -// Returns true if all functions have the same function attribute value. -// It also returns true when the module has no functions. 
+ // Returns true if all functions have the same function attribute value. + // It also returns true when the module has no functions. static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr, StringRef Value) { + return !any_of(M, [&](const Function &F) { + return F.getFnAttribute(Attr).getValueAsString() != Value; + }); +} +// Returns true if all functions have the same denormal mode. +// It also returns true when the module has no functions. +static bool checkDenormalAttributeConsistency(const Module &M, + StringRef Attr, + DenormalMode Value) { return !any_of(M, [&](const Function &F) { - return F.getFnAttribute(Attr).getValueAsString() != Value; + StringRef AttrVal = F.getFnAttribute(Attr).getValueAsString(); + return parseDenormalFPAttribute(AttrVal) != Value; }); } @@ -606,11 +626,12 @@ void ARMAsmPrinter::emitAttributes() { if (!ArchFS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); else - ArchFS = FS; + ArchFS = std::string(FS); } const ARMBaseTargetMachine &ATM = static_cast<const ARMBaseTargetMachine &>(TM); - const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian()); + const ARMSubtarget STI(TT, std::string(CPU), ArchFS, ATM, + ATM.isLittleEndian()); // Emit build attributes for the available hardware. ATS.emitTargetAttributes(STI); @@ -641,16 +662,13 @@ void ARMAsmPrinter::emitAttributes() { } // Set FP Denormals. - if (checkFunctionsAttributeConsistency(*MMI->getModule(), - "denormal-fp-math", - "preserve-sign") || - TM.Options.FPDenormalMode == FPDenormal::PreserveSign) + if (checkDenormalAttributeConsistency(*MMI->getModule(), "denormal-fp-math", + DenormalMode::getPreserveSign())) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::PreserveFPSign); - else if (checkFunctionsAttributeConsistency(*MMI->getModule(), - "denormal-fp-math", - "positive-zero") || - TM.Options.FPDenormalMode == FPDenormal::PositiveZero) + else if (checkDenormalAttributeConsistency(*MMI->getModule(), + "denormal-fp-math", + DenormalMode::getPositiveZero())) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::PositiveZero); else if (!TM.Options.UnsafeFPMath) @@ -855,8 +873,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, llvm_unreachable("unexpected target"); } -void ARMAsmPrinter:: -EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { +void ARMAsmPrinter::emitMachineConstantPoolValue( + MachineConstantPoolValue *MCPV) { const DataLayout &DL = getDataLayout(); int Size = DL.getTypeAllocSize(MCPV->getType()); @@ -876,11 +894,11 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { for (const auto *GV : ACPC->promotedGlobals()) { if (!EmittedPromotedGlobalLabels.count(GV)) { MCSymbol *GVSym = getSymbol(GV); - OutStreamer->EmitLabel(GVSym); + OutStreamer->emitLabel(GVSym); EmittedPromotedGlobalLabels.insert(GV); } } - return EmitGlobalConstant(DL, ACPC->getPromotedGlobalInit()); + return emitGlobalConstant(DL, ACPC->getPromotedGlobalInit()); } MCSymbol *MCSym; @@ -925,29 +943,29 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { // We want "(<expr> - .)", but MC doesn't have a concept of the '.' // label, so just emit a local label end reference that instead. 
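
The checkDenormalAttributeConsistency() helper introduced earlier in this hunk, like checkFunctionsAttributeConsistency() before it, expresses "all functions agree" as "no function disagrees" via !any_of, which also vacuously holds for a module with no functions. A standalone analogue with stand-in types:

    #include <algorithm>
    #include <string>
    #include <vector>

    // Stand-in types; the point is the "no function disagrees" shape,
    // which vacuously returns true for an empty module.
    struct Function { std::string DenormAttr; };

    bool allFunctionsAgree(const std::vector<Function> &M,
                           const std::string &Value) {
      return std::none_of(M.begin(), M.end(), [&](const Function &F) {
        return F.DenormAttr != Value;
      });
    }
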
MCSymbol *DotSym = OutContext.createTempSymbol(); - OutStreamer->EmitLabel(DotSym); + OutStreamer->emitLabel(DotSym); const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext); } Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext); } - OutStreamer->EmitValue(Expr, Size); + OutStreamer->emitValue(Expr, Size); } -void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { +void ARMAsmPrinter::emitJumpTableAddrs(const MachineInstr *MI) { const MachineOperand &MO1 = MI->getOperand(1); unsigned JTI = MO1.getIndex(); // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(Align(4)); + emitAlignment(Align(4)); // Emit a label for the jump table. MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); - OutStreamer->EmitLabel(JTISymbol); + OutStreamer->emitLabel(JTISymbol); // Mark the jump table as data-in-code. - OutStreamer->EmitDataRegion(MCDR_DataRegionJT32); + OutStreamer->emitDataRegion(MCDR_DataRegionJT32); // Emit each entry of the table. const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); @@ -974,23 +992,23 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) { else if (AFI->isThumbFunction()) Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(1,OutContext), OutContext); - OutStreamer->EmitValue(Expr, 4); + OutStreamer->emitValue(Expr, 4); } // Mark the end of jump table data-in-code region. - OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); + OutStreamer->emitDataRegion(MCDR_DataRegionEnd); } -void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) { +void ARMAsmPrinter::emitJumpTableInsts(const MachineInstr *MI) { const MachineOperand &MO1 = MI->getOperand(1); unsigned JTI = MO1.getIndex(); // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for // ARM mode tables. - EmitAlignment(Align(4)); + emitAlignment(Align(4)); // Emit a label for the jump table. MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); - OutStreamer->EmitLabel(JTISymbol); + OutStreamer->emitLabel(JTISymbol); // Emit each entry of the table. const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); @@ -1008,17 +1026,17 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) { } } -void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, +void ARMAsmPrinter::emitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth) { assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width"); const MachineOperand &MO1 = MI->getOperand(1); unsigned JTI = MO1.getIndex(); if (Subtarget->isThumb1Only()) - EmitAlignment(Align(4)); + emitAlignment(Align(4)); MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI); - OutStreamer->EmitLabel(JTISymbol); + OutStreamer->emitLabel(JTISymbol); // Emit each entry of the table. const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); @@ -1026,7 +1044,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs; // Mark the jump table as data-in-code. - OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8 + OutStreamer->emitDataRegion(OffsetWidth == 1 ? 
MCDR_DataRegionJT8 : MCDR_DataRegionJT16); for (auto MBB : JTBBs) { @@ -1050,15 +1068,15 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI, Expr = MCBinaryExpr::createSub(MBBSymbolExpr, Expr, OutContext); Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext), OutContext); - OutStreamer->EmitValue(Expr, OffsetWidth); + OutStreamer->emitValue(Expr, OffsetWidth); } // Mark the end of jump table data-in-code region. 32-bit offsets use // actual branch instructions here, so we don't mark those as a data-region // at all. - OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); + OutStreamer->emitDataRegion(MCDR_DataRegionEnd); // Make sure the next instruction is 2-byte aligned. - EmitAlignment(Align(2)); + emitAlignment(Align(2)); } void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { @@ -1076,16 +1094,26 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { unsigned Opc = MI->getOpcode(); unsigned SrcReg, DstReg; - if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) { - // Two special cases: - // 1) tPUSH does not have src/dst regs. - // 2) for Thumb1 code we sometimes materialize the constant via constpool - // load. Yes, this is pretty fragile, but for now I don't see better - // way... :( + switch (Opc) { + case ARM::tPUSH: + // special case: tPUSH does not have src/dst regs. SrcReg = DstReg = ARM::SP; - } else { + break; + case ARM::tLDRpci: + case ARM::t2MOVi16: + case ARM::t2MOVTi16: + // special cases: + // 1) for Thumb1 code we sometimes materialize the constant via constpool + // load. + // 2) for Thumb2 execute only code we materialize the constant via + // immediate constants in 2 separate instructions (MOVW/MOVT). + SrcReg = ~0U; + DstReg = MI->getOperand(0).getReg(); + break; + default: SrcReg = MI->getOperand(1).getReg(); DstReg = MI->getOperand(0).getReg(); + break; } // Try to figure out the unwinding opcode out of src / dst regs. @@ -1189,23 +1217,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { case ARM::tADDrSPi: Offset = -MI->getOperand(2).getImm()*4; break; - case ARM::tLDRpci: { - // Grab the constpool index and check, whether it corresponds to - // original or cloned constpool entry. - unsigned CPI = MI->getOperand(1).getIndex(); - const MachineConstantPool *MCP = MF.getConstantPool(); - if (CPI >= MCP->getConstants().size()) - CPI = AFI->getOriginalCPIdx(CPI); - assert(CPI != -1U && "Invalid constpool index"); - - // Derive the actual offset. - const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; - assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry"); - // FIXME: Check for user, it should be "add" instruction! - Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue(); + case ARM::tADDhirr: + Offset = + -AFI->EHPrologueOffsetInRegs.lookup(MI->getOperand(2).getReg()); break; } - } if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) { if (DstReg == FramePtr && FramePtr != ARM::SP) @@ -1225,14 +1241,43 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { } else if (DstReg == ARM::SP) { MI->print(errs()); llvm_unreachable("Unsupported opcode for unwinding information"); - } else if (Opc == ARM::tMOVr) { - // If a Thumb1 function spills r8-r11, we copy the values to low - // registers before pushing them. Record the copy so we can emit the - // correct ".save" later. 
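
For the compressed TBB/TBH jump tables emitted earlier in this hunk, each table entry is the distance from the table to the target block divided by two (Thumb instructions are 2-byte aligned), stored in one byte for TBB or two for TBH. A model of that encoding; the helper name and the assert are invented for illustration:

    #include <cassert>
    #include <cstdint>

    // tbEntry() is an invented name. Entries are (Target - Table) / 2,
    // stored in OffsetWidth bytes: 1 for TBB, 2 for TBH.
    uint32_t tbEntry(uint32_t TargetAddr, uint32_t TableAddr,
                     unsigned OffsetWidth) {
      uint32_t Entry = (TargetAddr - TableAddr) / 2;
      assert(Entry < (1u << (8 * OffsetWidth)) && "offset must fit entry width");
      return Entry;
    }
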
- AFI->EHPrologueRemappedRegs[DstReg] = SrcReg; } else { - MI->print(errs()); - llvm_unreachable("Unsupported opcode for unwinding information"); + int64_t Offset = 0; + switch (Opc) { + case ARM::tMOVr: + // If a Thumb1 function spills r8-r11, we copy the values to low + // registers before pushing them. Record the copy so we can emit the + // correct ".save" later. + AFI->EHPrologueRemappedRegs[DstReg] = SrcReg; + break; + case ARM::tLDRpci: { + // Grab the constpool index and check, whether it corresponds to + // original or cloned constpool entry. + unsigned CPI = MI->getOperand(1).getIndex(); + const MachineConstantPool *MCP = MF.getConstantPool(); + if (CPI >= MCP->getConstants().size()) + CPI = AFI->getOriginalCPIdx(CPI); + assert(CPI != -1U && "Invalid constpool index"); + + // Derive the actual offset. + const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; + assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry"); + Offset = cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue(); + AFI->EHPrologueOffsetInRegs[DstReg] = Offset; + break; + } + case ARM::t2MOVi16: + Offset = MI->getOperand(1).getImm(); + AFI->EHPrologueOffsetInRegs[DstReg] = Offset; + break; + case ARM::t2MOVTi16: + Offset = MI->getOperand(2).getImm(); + AFI->EHPrologueOffsetInRegs[DstReg] |= (Offset << 16); + break; + default: + MI->print(errs()); + llvm_unreachable("Unsupported opcode for unwinding information"); + } } } } @@ -1241,7 +1286,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { // instructions) auto-generated. #include "ARMGenMCPseudoLowering.inc" -void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { +void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { const DataLayout &DL = getDataLayout(); MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); @@ -1252,7 +1297,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // If we just ended a constant pool, mark it as such. if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { - OutStreamer->EmitDataRegion(MCDR_DataRegionEnd); + OutStreamer->emitDataRegion(MCDR_DataRegionEnd); InConstantPool = false; } @@ -1513,7 +1558,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This is a pseudo op for a label used by a branch future instruction // Emit the label. - OutStreamer->EmitLabel(getBFLabel(DL.getPrivateGlobalPrefix(), + OutStreamer->emitLabel(getBFLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), MI->getOperand(0).getIndex(), OutContext)); return; @@ -1525,7 +1570,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), + OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), MI->getOperand(2).getImm(), OutContext)); @@ -1546,7 +1591,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), + OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), MI->getOperand(2).getImm(), OutContext)); @@ -1577,7 +1622,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // a PC-relative address at the ldr instruction. // Emit the label. 
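
The new t2MOVi16/t2MOVTi16 cases in this hunk track stack-adjustment constants materialized as a MOVW/MOVT pair: MOVW supplies the low 16 bits and MOVT the high 16, which is why EHPrologueOffsetInRegs is updated with |= (Offset << 16). The same bookkeeping in isolation, under an invented helper name:

    #include <cassert>
    #include <cstdint>

    // combineMovwMovt() is an invented name for the bookkeeping only.
    uint32_t combineMovwMovt(uint16_t movwImm, uint16_t movtImm) {
      uint32_t Offset = movwImm;          // t2MOVi16 writes the low half
      Offset |= uint32_t(movtImm) << 16;  // t2MOVTi16 ORs in the high half
      return Offset;
    }

    int main() {
      assert(combineMovwMovt(0xBEEF, 0xDEAD) == 0xDEADBEEFu);
    }
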
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), + OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), MI->getOperand(2).getImm(), OutContext)); @@ -1620,28 +1665,28 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // If this is the first entry of the pool, mark it. if (!InConstantPool) { - OutStreamer->EmitDataRegion(MCDR_DataRegion); + OutStreamer->emitDataRegion(MCDR_DataRegion); InConstantPool = true; } - OutStreamer->EmitLabel(GetCPISymbol(LabelId)); + OutStreamer->emitLabel(GetCPISymbol(LabelId)); const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; if (MCPE.isMachineConstantPoolEntry()) - EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); + emitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(DL, MCPE.Val.ConstVal); + emitGlobalConstant(DL, MCPE.Val.ConstVal); return; } case ARM::JUMPTABLE_ADDRS: - EmitJumpTableAddrs(MI); + emitJumpTableAddrs(MI); return; case ARM::JUMPTABLE_INSTS: - EmitJumpTableInsts(MI); + emitJumpTableInsts(MI); return; case ARM::JUMPTABLE_TBB: case ARM::JUMPTABLE_TBH: - EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2); + emitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2); return; case ARM::t2BR_JT: { EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) @@ -1656,7 +1701,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { case ARM::t2TBH_JT: { unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? ARM::t2TBB : ARM::t2TBH; // Lower and emit the PC label, then the instruction itself. - OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm())); + OutStreamer->emitLabel(GetCPISymbol(MI->getOperand(3).getImm())); EmitToStreamer(*OutStreamer, MCInstBuilder(Opc) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) @@ -1698,7 +1743,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // FIXME: Ideally we could vary the LDRB index based on the padding // between the sequence and jump table, however that relies on MCExprs // for load indexes which are currently not supported. - OutStreamer->EmitCodeAlignment(4); + OutStreamer->emitCodeAlignment(4); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr) .addReg(Idx) .addReg(Idx) @@ -1740,7 +1785,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { .addImm(ARMCC::AL) .addReg(0)); - OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm())); + OutStreamer->emitLabel(GetCPISymbol(MI->getOperand(3).getImm())); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr) .addReg(ARM::PC) .addReg(ARM::PC) @@ -1809,7 +1854,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case ARM::SPACE: - OutStreamer->EmitZeros(MI->getOperand(1).getImm()); + OutStreamer->emitZeros(MI->getOperand(1).getImm()); return; case ARM::TRAP: { // Non-Darwin binutils don't yet support the "trap" mnemonic. 
@@ -1904,7 +1949,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { .addImm(ARMCC::AL) .addReg(0)); - OutStreamer->EmitLabel(Label); + OutStreamer->emitLabel(Label); return; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h index a4b37fa2331f..f8ff047a1d06 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -84,21 +84,21 @@ public: void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo, const MCSubtargetInfo *EndInfo) const override; - void EmitJumpTableAddrs(const MachineInstr *MI); - void EmitJumpTableInsts(const MachineInstr *MI); - void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth); - void EmitInstruction(const MachineInstr *MI) override; + void emitJumpTableAddrs(const MachineInstr *MI); + void emitJumpTableInsts(const MachineInstr *MI); + void emitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth); + void emitInstruction(const MachineInstr *MI) override; bool runOnMachineFunction(MachineFunction &F) override; - void EmitConstantPool() override { + void emitConstantPool() override { // we emit constant pools customly! } - void EmitFunctionBodyEnd() override; - void EmitFunctionEntryLabel() override; - void EmitStartOfAsmFile(Module &M) override; - void EmitEndOfAsmFile(Module &M) override; - void EmitXXStructor(const DataLayout &DL, const Constant *CV) override; - void EmitGlobalVariable(const GlobalVariable *GV) override; + void emitFunctionBodyEnd() override; + void emitFunctionEntryLabel() override; + void emitStartOfAsmFile(Module &M) override; + void emitEndOfAsmFile(Module &M) override; + void emitXXStructor(const DataLayout &DL, const Constant *CV) override; + void emitGlobalVariable(const GlobalVariable *GV) override; MCSymbol *GetCPISymbol(unsigned CPID) const override; @@ -117,7 +117,7 @@ public: private: void EmitSled(const MachineInstr &MI, SledKind Kind); - // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile() + // Helpers for emitStartOfAsmFile() and emitEndOfAsmFile() void emitAttributes(); // Generic helper used to emit e.g. ARMv5 mul pseudos @@ -150,7 +150,7 @@ private: public: /// EmitMachineConstantPoolValue - Print a machine constantpool value to /// the .s file. 
- void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override; + void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override; }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 48f781510254..4cc2b6bf7e7e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/ScoreboardHazardRecognizer.h" @@ -495,6 +496,31 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const { return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL; } +std::string ARMBaseInstrInfo::createMIROperandComment( + const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx, + const TargetRegisterInfo *TRI) const { + + // First, let's see if there is a generic comment for this operand + std::string GenericComment = + TargetInstrInfo::createMIROperandComment(MI, Op, OpIdx, TRI); + if (!GenericComment.empty()) + return GenericComment; + + // If not, check if we have an immediate operand. + if (Op.getType() != MachineOperand::MO_Immediate) + return std::string(); + + // And print its corresponding condition code if the immediate is a + // predicate. + int FirstPredOp = MI.findFirstPredOperandIdx(); + if (FirstPredOp != (int) OpIdx) + return std::string(); + + std::string CC = "CC::"; + CC += ARMCondCodeToString((ARMCC::CondCodes)Op.getImm()); + return CC; +} + bool ARMBaseInstrInfo::PredicateInstruction( MachineInstr &MI, ArrayRef<MachineOperand> Pred) const { unsigned Opc = MI.getOpcode(); @@ -811,7 +837,7 @@ void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) { } void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, - unsigned DestReg) { + Register DestReg) { addUnpredicatedMveVpredNOp(MIB); MIB.addReg(DestReg, RegState::Undef); } @@ -1009,6 +1035,36 @@ ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; } +Optional<ParamLoadedValue> +ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI, + Register Reg) const { + if (auto DstSrcPair = isCopyInstrImpl(MI)) { + Register DstReg = DstSrcPair->Destination->getReg(); + + // TODO: We don't handle cases where the forwarding reg is narrower/wider + // than the copy registers. Consider for example: + // + // s16 = VMOVS s0 + // s17 = VMOVS s1 + // call @callee(d0) + // + // We'd like to describe the call site value of d0 as d8, but this requires + // gathering and merging the descriptions for the two VMOVS instructions. + // + // We also don't handle the reverse situation, where the forwarding reg is + // narrower than the copy destination: + // + // d8 = VMOVD d0 + // call @callee(s1) + // + // We need to produce a fragment description (the call site value of s1 is + // /not/ just d8). 
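
The createMIROperandComment() hook added above annotates predicate immediates in MIR dumps with their condition-code name, e.g. building "CC::" plus the ARMCondCodeToString() result. A toy version of just the formatting step; the enum mirrors only three of the real ARMCC::CondCodes values:

    #include <string>

    // Toy formatting step; the enum covers only three illustrative values.
    enum CondCodes { EQ = 0, NE = 1, AL = 14 };

    std::string condCodeComment(int Imm) {
      std::string CC = "CC::";
      switch (Imm) {
      case EQ: CC += "eq"; break;
      case NE: CC += "ne"; break;
      default: CC += "al"; break; // AL and, for this sketch, anything else
      }
      return CC;
    }
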
+ if (DstReg != Reg) + return None; + } + return TargetInstrInfo::describeLoadedValue(MI, Reg); +} + const MachineInstrBuilder & ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, @@ -1023,16 +1079,16 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg, void ARMBaseInstrInfo:: storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, + Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); + Align Alignment = MFI.getObjectAlign(FI); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, - MFI.getObjectSize(FI), Align); + MFI.getObjectSize(FI), Alignment); switch (TRI->getSpillSize(*RC)) { case 2: @@ -1102,7 +1158,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case 16: if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) { // Use aligned spills if the stack can be realigned. - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF)) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64)) .addFrameIndex(FI) .addImm(16) @@ -1130,7 +1186,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { // Use aligned spills if the stack can be realigned. - if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && Subtarget.hasNEON()) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo)) .addFrameIndex(FI) @@ -1153,7 +1209,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && Subtarget.hasNEON()) { // FIXME: It's possible to only store part of the QQ register if the // spilled def has a sub-register index. 
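
The spill-slot changes above replace raw unsigned alignments with LLVM's Align type while keeping comparisons like "Alignment >= 16" readable. A minimal stand-in for such a wrapper, assuming only what the diff shows (this is not the real llvm::Align API):

    #include <cassert>
    #include <cstdint>

    // Minimal stand-in, not the real llvm::Align: a non-zero power-of-two
    // byte alignment that still supports 'Alignment >= 16' style checks.
    struct Align {
      uint64_t Value;
      explicit Align(uint64_t V) : Value(V) {
        assert(V != 0 && (V & (V - 1)) == 0 && "must be a power of two");
      }
      friend bool operator>=(Align A, uint64_t B) { return A.Value >= B; }
    };

    int main() {
      Align Alignment(16);
      assert(Alignment >= 16); // mirrors the spill-path checks above
    }
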
@@ -1264,17 +1320,17 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, void ARMBaseInstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, + Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); + const Align Alignment = MFI.getObjectAlign(FI); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), Align); + MFI.getObjectSize(FI), Alignment); switch (TRI->getSpillSize(*RC)) { case 2: @@ -1343,7 +1399,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 16: if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF)) { BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) .addFrameIndex(FI) .addImm(16) @@ -1367,7 +1423,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 24: if (ARM::DTripleRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg) .addFrameIndex(FI) @@ -1390,7 +1446,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, break; case 32: if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) { - if (Align >= 16 && getRegisterInfo().canRealignStack(MF) && + if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) && Subtarget.hasNEON()) { BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg) .addFrameIndex(FI) @@ -1682,13 +1738,13 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) { cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4); else llvm_unreachable("Unexpected ARM constantpool value type!!"); - CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment()); + CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlign()); return PCLabelId; } void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SubIdx, + Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { unsigned Opcode = Orig.getOpcode(); @@ -1959,6 +2015,10 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI, if (MI.isTerminator() || MI.isPosition()) return true; + // INLINEASM_BR can jump to another block + if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) + return true; + // Treat the start of the IT block as a scheduling boundary, but schedule // t2IT along with all instructions following it. // FIXME: This is a big hammer. But the alternative is to add all potential @@ -2120,7 +2180,7 @@ ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, /// condition, otherwise returns AL. It also returns the condition code /// register by reference. 
ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI, - unsigned &PredReg) { + Register &PredReg) { int PIdx = MI.findFirstPredOperandIdx(); if (PIdx == -1) { PredReg = 0; @@ -2150,7 +2210,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, case ARM::MOVCCr: case ARM::t2MOVCCr: { // MOVCC can be commuted by inverting the condition. - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg); // MOVCC AL can't be inverted. Shouldn't happen. if (CC == ARMCC::AL || PredReg != ARM::CPSR) @@ -2171,9 +2231,9 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI, /// Identify instructions that can be folded into a MOVCC instruction, and /// return the defining instruction. MachineInstr * -ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, +ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const { - if (!Register::isVirtualRegister(Reg)) + if (!Reg.isVirtual()) return nullptr; if (!MRI.hasOneNonDBGUse(Reg)) return nullptr; @@ -2353,9 +2413,9 @@ unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) { void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, - unsigned BaseReg, int NumBytes, - ARMCC::CondCodes Pred, unsigned PredReg, + const DebugLoc &dl, Register DestReg, + Register BaseReg, int NumBytes, + ARMCC::CondCodes Pred, Register PredReg, const ARMBaseInstrInfo &TII, unsigned MIFlags) { if (NumBytes == 0 && DestReg != BaseReg) { @@ -2515,7 +2575,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, } bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + Register FrameReg, int &Offset, const ARMBaseInstrInfo &TII) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MI.getDesc(); @@ -2671,8 +2731,8 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx, /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. Return true if the comparison instruction /// can be analyzed. -bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, +bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const { switch (MI.getOpcode()) { default: break; @@ -2708,7 +2768,7 @@ bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, /// operates on the given source register and applies the same mask /// as a 'tst' instruction. Provide a limited look-through for copies. /// When successful, MI will hold the found instruction. -static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, +static bool isSuitableForMask(MachineInstr *&MI, Register SrcReg, int CmpMask, bool CommonUse) { switch (MI->getOpcode()) { case ARM::ANDri: @@ -2743,7 +2803,7 @@ inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) { /// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X). /// This function can be extended later on. 
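
commuteInstructionImpl() earlier in this hunk commutes MOVCC/t2MOVCCr by swapping the two source operands and inverting the condition, bailing out on AL, which has no inverse. The underlying identity, sketched with stand-in condition values rather than the real ARMCC machinery:

    #include <cassert>

    // Stand-in conditions; the real code computes the ARMCC inverse.
    enum Cond { EQ, NE, AL };

    Cond invert(Cond C) { return C == EQ ? NE : C == NE ? EQ : AL; }

    // select() models MOVCC: 'IfTrue' if the condition holds, else 'IfFalse'.
    int select(bool CondHolds, int IfTrue, int IfFalse) {
      return CondHolds ? IfTrue : IfFalse;
    }

    int main() {
      for (bool Z : {false, true})
        assert(select(Z, 1, 2) == select(!Z, 2, 1)); // swap + invert is a no-op
    }
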
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI, - unsigned SrcReg, unsigned SrcReg2, + Register SrcReg, Register SrcReg2, int ImmValue, const MachineInstr *OI, bool &IsThumb1) { if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) && @@ -2879,7 +2939,7 @@ static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) { /// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the /// condition code of instructions which use the flags. bool ARMBaseInstrInfo::optimizeCompareInstr( - MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); @@ -3166,7 +3226,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const { return true; MachineBasicBlock::const_iterator Next = &MI; ++Next; - unsigned SrcReg, SrcReg2; + Register SrcReg, SrcReg2; int CmpMask, CmpValue; bool IsThumb1; if (Next != MI.getParent()->end() && @@ -3177,7 +3237,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const { } bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, - unsigned Reg, + Register Reg, MachineRegisterInfo *MRI) const { // Fold large immediates into add, sub, or, xor. unsigned DefOpc = DefMI.getOpcode(); @@ -3729,7 +3789,7 @@ unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData, // If there are odd number of registers or if it's not 64-bit aligned, // then it takes an extra AGU (Address Generation Unit) cycle. if ((NumRegs % 2) || !MI.hasOneMemOperand() || - (*MI.memoperands_begin())->getAlignment() < 8) + (*MI.memoperands_begin())->getAlign() < Align(8)) ++UOps; return UOps; } @@ -4316,10 +4376,10 @@ int ARMBaseInstrInfo::getOperandLatencyImpl( return -1; unsigned DefAlign = DefMI.hasOneMemOperand() - ? (*DefMI.memoperands_begin())->getAlignment() + ? (*DefMI.memoperands_begin())->getAlign().value() : 0; unsigned UseAlign = UseMI.hasOneMemOperand() - ? (*UseMI.memoperands_begin())->getAlignment() + ? (*UseMI.memoperands_begin())->getAlign().value() : 0; // Get the itinerary's latency if possible, and handle variable_ops. @@ -4366,10 +4426,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode()); auto *DefMN = cast<MachineSDNode>(DefNode); unsigned DefAlign = !DefMN->memoperands_empty() - ? (*DefMN->memoperands_begin())->getAlignment() : 0; + ? (*DefMN->memoperands_begin())->getAlign().value() + : 0; auto *UseMN = cast<MachineSDNode>(UseNode); unsigned UseAlign = !UseMN->memoperands_empty() - ? (*UseMN->memoperands_begin())->getAlignment() : 0; + ? (*UseMN->memoperands_begin())->getAlign().value() + : 0; int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign, UseMCID, UseIdx, UseAlign); @@ -4660,7 +4722,7 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, // Adjust for dynamic def-side opcode variants not captured by the itinerary. unsigned DefAlign = - MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0; + MI.hasOneMemOperand() ? 
(*MI.memoperands_begin())->getAlign().value() : 0; int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign); if (Adj >= 0 || (int)Latency > -Adj) { return Latency + Adj; @@ -4782,7 +4844,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4)); MIB.addMemOperand(MMO).add(predOps(ARMCC::AL)); } @@ -5353,7 +5415,8 @@ Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI, // TODO: Handle cases where Reg is a super- or sub-register of the // destination register. - if (Reg != MI.getOperand(0).getReg()) + const MachineOperand &Op0 = MI.getOperand(0); + if (!Op0.isReg() || Reg != Op0.getReg()) return None; // We describe SUBri or ADDri instructions. @@ -5365,8 +5428,7 @@ Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI, // TODO: Third operand can be global address (usually some string). Since // strings can be relocated we cannot calculate their offsets for // now. - if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || - !MI.getOperand(2).isImm()) + if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm()) return None; Offset = MI.getOperand(2).getImm() * Sign; @@ -5402,7 +5464,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br, if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri) return nullptr; Register Reg = CmpMI->getOperand(0).getReg(); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg); if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0) return nullptr; @@ -5460,3 +5522,521 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) < ConstantMaterializationCost(Val2, Subtarget, !ForCodesize); } + +/// Constants defining how certain sequences should be outlined. +/// This encompasses how an outlined function should be called, and what kind of +/// frame should be emitted for that outlined function. +/// +/// \p MachineOutlinerTailCall implies that the function is being created from +/// a sequence of instructions ending in a return. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> B OUTLINED_FUNCTION I1 +/// BX LR I2 +/// BX LR +/// +/// +-------------------------+--------+-----+ +/// | | Thumb2 | ARM | +/// +-------------------------+--------+-----+ +/// | Call overhead in Bytes | 4 | 4 | +/// | Frame overhead in Bytes | 0 | 0 | +/// | Stack fixup required | No | No | +/// +-------------------------+--------+-----+ +/// +/// \p MachineOutlinerThunk implies that the function is being created from +/// a sequence of instructions ending in a call. The outlined function is +/// called with a BL instruction, and the outlined function tail-calls the +/// original call destination. 
+/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// BL f I2 +/// B f +/// +/// +-------------------------+--------+-----+ +/// | | Thumb2 | ARM | +/// +-------------------------+--------+-----+ +/// | Call overhead in Bytes | 4 | 4 | +/// | Frame overhead in Bytes | 0 | 0 | +/// | Stack fixup required | No | No | +/// +-------------------------+--------+-----+ +/// +/// \p MachineOutlinerNoLRSave implies that the function should be called using +/// a BL instruction, but doesn't require LR to be saved and restored. This +/// happens when LR is known to be dead. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 I2 +/// I3 +/// BX LR +/// +/// +-------------------------+--------+-----+ +/// | | Thumb2 | ARM | +/// +-------------------------+--------+-----+ +/// | Call overhead in Bytes | 4 | 4 | +/// | Frame overhead in Bytes | 4 | 4 | +/// | Stack fixup required | No | No | +/// +-------------------------+--------+-----+ +/// +/// \p MachineOutlinerRegSave implies that the function should be called with a +/// save and restore of LR to an available register. This allows us to avoid +/// stack fixups. Note that this outlining variant is compatible with the +/// NoLRSave case. +/// +/// That is, +/// +/// I1 Save LR OUTLINED_FUNCTION: +/// I2 --> BL OUTLINED_FUNCTION I1 +/// I3 Restore LR I2 +/// I3 +/// BX LR +/// +/// +-------------------------+--------+-----+ +/// | | Thumb2 | ARM | +/// +-------------------------+--------+-----+ +/// | Call overhead in Bytes | 8 | 12 | +/// | Frame overhead in Bytes | 2 | 4 | +/// | Stack fixup required | No | No | +/// +-------------------------+--------+-----+ + +enum MachineOutlinerClass { + MachineOutlinerTailCall, + MachineOutlinerThunk, + MachineOutlinerNoLRSave, + MachineOutlinerRegSave +}; + +enum MachineOutlinerMBBFlags { + LRUnavailableSomewhere = 0x2, + HasCalls = 0x4, + UnsafeRegsDead = 0x8 +}; + +struct OutlinerCosts { + const int CallTailCall; + const int FrameTailCall; + const int CallThunk; + const int FrameThunk; + const int CallNoLRSave; + const int FrameNoLRSave; + const int CallRegSave; + const int FrameRegSave; + + OutlinerCosts(const ARMSubtarget &target) + : CallTailCall(target.isThumb() ? 4 : 4), + FrameTailCall(target.isThumb() ? 0 : 0), + CallThunk(target.isThumb() ? 4 : 4), + FrameThunk(target.isThumb() ? 0 : 0), + CallNoLRSave(target.isThumb() ? 4 : 4), + FrameNoLRSave(target.isThumb() ? 4 : 4), + CallRegSave(target.isThumb() ? 8 : 12), + FrameRegSave(target.isThumb() ? 2 : 4) {} +}; + +unsigned +ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { + assert(C.LRUWasSet && "LRU wasn't set?"); + MachineFunction *MF = C.getMF(); + const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + + BitVector regsReserved = ARI->getReservedRegs(*MF); + // Check if there is an available register across the sequence that we can + // use. + for (unsigned Reg : ARM::rGPRRegClass) { + if (!(Reg < regsReserved.size() && regsReserved.test(Reg)) && + Reg != ARM::LR && // LR is not reserved, but don't use it. + Reg != ARM::R12 && // R12 is not guaranteed to be preserved. + C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) + return Reg; + } + + // No suitable register. Return 0. 
+ return 0u; +} + +outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( + std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { + outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; + unsigned SequenceSize = + std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, + [this](unsigned Sum, const MachineInstr &MI) { + return Sum + getInstSizeInBytes(MI); + }); + + // Properties about candidate MBBs that hold for all of them. + unsigned FlagsSetInAll = 0xF; + + // Compute liveness information for each candidate, and set FlagsSetInAll. + const TargetRegisterInfo &TRI = getRegisterInfo(); + std::for_each( + RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), + [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; }); + + // According to the ARM Procedure Call Standard, the following are + // undefined on entry/exit from a function call: + // + // * Register R12(IP), + // * Condition codes (and thus the CPSR register) + // + // Since we control the instructions which are part of the outlined regions + // we don't need to be fully compliant with the AAPCS, but we have to + // guarantee that if a veneer is inserted at link time the code is still + // correct. Because of this, we can't outline any sequence of instructions + // where one of these registers is live into/across it. Thus, we need to + // delete those candidates. + auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { + // If the unsafe registers in this block are all dead, then we don't need + // to compute liveness here. + if (C.Flags & UnsafeRegsDead) + return false; + C.initLRU(TRI); + LiveRegUnits LRU = C.LRU; + return (!LRU.available(ARM::R12) || !LRU.available(ARM::CPSR)); + }; + + // Are there any candidates where those registers are live? + if (!(FlagsSetInAll & UnsafeRegsDead)) { + // Erase every candidate that violates the restrictions above. (It could be + // true that we have viable candidates, so it's not worth bailing out in + // the case that, say, 1 out of 20 candidates violate the restructions.) + RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), + RepeatedSequenceLocs.end(), + CantGuaranteeValueAcrossCall), + RepeatedSequenceLocs.end()); + + // If the sequence doesn't have enough candidates left, then we're done. + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + } + + // At this point, we have only "safe" candidates to outline. Figure out + // frame + call instruction information. + + unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); + + // Helper lambda which sets call information for every candidate. + auto SetCandidateCallInfo = + [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { + for (outliner::Candidate &C : RepeatedSequenceLocs) + C.setCallInfo(CallID, NumBytesForCall); + }; + + OutlinerCosts Costs(Subtarget); + unsigned FrameID = 0; + unsigned NumBytesToCreateFrame = 0; + + // If the last instruction in any candidate is a terminator, then we should + // tail call all of the candidates. 
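
getOutliningCandidateInfo() above drops candidates where R12 or CPSR is live across the sequence using the classic erase-remove idiom. The same pattern in standalone form; the element type and predicate are stand-ins:

    #include <algorithm>
    #include <vector>

    // Element type and predicate are stand-ins for outliner::Candidate
    // and CantGuaranteeValueAcrossCall.
    struct Candidate { bool Unsafe; };

    void pruneUnsafe(std::vector<Candidate> &Cands) {
      Cands.erase(std::remove_if(Cands.begin(), Cands.end(),
                                 [](const Candidate &C) { return C.Unsafe; }),
                  Cands.end());
    }
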
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) { + FrameID = MachineOutlinerTailCall; + NumBytesToCreateFrame = Costs.FrameTailCall; + SetCandidateCallInfo(MachineOutlinerTailCall, Costs.CallTailCall); + } else if (LastInstrOpcode == ARM::BL || LastInstrOpcode == ARM::BLX || + LastInstrOpcode == ARM::tBL || LastInstrOpcode == ARM::tBLXr || + LastInstrOpcode == ARM::tBLXi) { + FrameID = MachineOutlinerThunk; + NumBytesToCreateFrame = Costs.FrameThunk; + SetCandidateCallInfo(MachineOutlinerThunk, Costs.CallThunk); + } else { + // We need to decide how to emit calls + frames. We can always emit the same + // frame if we don't need to save to the stack. + unsigned NumBytesNoStackCalls = 0; + std::vector<outliner::Candidate> CandidatesWithoutStackFixups; + + for (outliner::Candidate &C : RepeatedSequenceLocs) { + C.initLRU(TRI); + + // Is LR available? If so, we don't need a save. + if (C.LRU.available(ARM::LR)) { + FrameID = MachineOutlinerNoLRSave; + NumBytesNoStackCalls += Costs.CallNoLRSave; + C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave); + CandidatesWithoutStackFixups.push_back(C); + } + + // Is an unused register available? If so, we won't modify the stack, so + // we can outline with the same frame type as those that don't save LR. + else if (findRegisterToSaveLRTo(C)) { + FrameID = MachineOutlinerRegSave; + NumBytesNoStackCalls += Costs.CallRegSave; + C.setCallInfo(MachineOutlinerRegSave, Costs.CallRegSave); + CandidatesWithoutStackFixups.push_back(C); + } + } + + if (!CandidatesWithoutStackFixups.empty()) { + RepeatedSequenceLocs = CandidatesWithoutStackFixups; + } else + return outliner::OutlinedFunction(); + } + + return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, + NumBytesToCreateFrame, FrameID); +} + +bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom( + MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { + const Function &F = MF.getFunction(); + + // Can F be deduplicated by the linker? If it can, don't outline from it. + if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) + return false; + + // Don't outline from functions with section markings; the program could + // expect that all the code is in the named section. + // FIXME: Allow outlining from multiple functions with the same section + // marking. + if (F.hasSection()) + return false; + + // FIXME: Thumb1 outlining is not handled + if (MF.getInfo<ARMFunctionInfo>()->isThumb1OnlyFunction()) + return false; + + // It's safe to outline from MF. + return true; +} + +bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const { + // Check if LR is available through all of the MBB. If it's not, then set + // a flag. + assert(MBB.getParent()->getRegInfo().tracksLiveness() && + "Suitable Machine Function for outlining must track liveness"); + + LiveRegUnits LRU(getRegisterInfo()); + + std::for_each(MBB.rbegin(), MBB.rend(), + [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); + + // Check if each of the unsafe registers are available... + bool R12AvailableInBlock = LRU.available(ARM::R12); + bool CPSRAvailableInBlock = LRU.available(ARM::CPSR); + + // If all of these are dead (and not live out), we know we don't have to check + // them later. + if (R12AvailableInBlock && CPSRAvailableInBlock) + Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; + + // Now, add the live outs to the set. + LRU.addLiveOuts(MBB); + + // If any of these registers is available in the MBB, but also a live out of + // the block, then we know outlining is unsafe. 
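Condensed, the liveness protocol used above and checked just below (a restatement with the same helpers, not new logic):

  LiveRegUnits LRU(getRegisterInfo());
  for (MachineInstr &MI : reverse(MBB))
    LRU.accumulate(MI);            // "is the register touched in the block?"
  bool R12FreeInBlock = LRU.available(ARM::R12);
  LRU.addLiveOuts(MBB);            // now also "is it live after the block?"
  // R12FreeInBlock && !LRU.available(ARM::R12) means R12 is live *through*
  // the block without being used in it, so outlining from here is unsafe.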
+ if (R12AvailableInBlock && !LRU.available(ARM::R12))
+ return false;
+ if (CPSRAvailableInBlock && !LRU.available(ARM::CPSR))
+ return false;
+
+ // Check if there's a call inside this MachineBasicBlock. If there is, then
+ // set a flag.
+ if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
+ Flags |= MachineOutlinerMBBFlags::HasCalls;
+
+ if (!LRU.available(ARM::LR))
+ Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+
+ return true;
+}
+
+outliner::InstrType
+ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // Be conservative with inline ASM
+ if (MI.isInlineAsm())
+ return outliner::InstrType::Illegal;
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+ // At this point, KILL or IMPLICIT_DEF instructions don't really tell us much
+ // so we can go ahead and skip over them.
+ if (MI.isKill() || MI.isImplicitDef())
+ return outliner::InstrType::Invisible;
+
+ // PIC instructions contain labels, outlining them would break offset
+ // computing.
+ unsigned Opc = MI.getOpcode();
+ if (Opc == ARM::tPICADD || Opc == ARM::PICADD || Opc == ARM::PICSTR ||
+ Opc == ARM::PICSTRB || Opc == ARM::PICSTRH || Opc == ARM::PICLDR ||
+ Opc == ARM::PICLDRB || Opc == ARM::PICLDRH || Opc == ARM::PICLDRSB ||
+ Opc == ARM::PICLDRSH || Opc == ARM::t2LDRpci_pic ||
+ Opc == ARM::t2MOVi16_ga_pcrel || Opc == ARM::t2MOVTi16_ga_pcrel ||
+ Opc == ARM::t2MOV_ga_pcrel)
+ return outliner::InstrType::Illegal;
+
+ // Be conservative with ARMv8.1 MVE instructions.
+ if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
+ Opc == ARM::t2WhileLoopStart || Opc == ARM::t2LoopDec ||
+ Opc == ARM::t2LoopEnd)
+ return outliner::InstrType::Illegal;
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t MIFlags = MCID.TSFlags;
+ if ((MIFlags & ARMII::DomainMask) == ARMII::DomainMVE)
+ return outliner::InstrType::Illegal;
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+ // Don't outline if the branch is not unconditional.
+ if (isPredicated(MI))
+ return outliner::InstrType::Illegal;
+
+ // Is this the end of a function?
+ if (MI.getParent()->succ_empty())
+ return outliner::InstrType::Legal;
+
+ // It's not, so don't outline it.
+ return outliner::InstrType::Illegal;
+ }
+
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return outliner::InstrType::Illegal;
+ }
+
+ // Don't outline if link register or program counter value are used.
+ if (MI.readsRegister(ARM::LR, TRI) || MI.readsRegister(ARM::PC, TRI))
+ return outliner::InstrType::Illegal;
+
+ if (MI.isCall()) {
+ // If we don't know anything about the callee, assume it depends on the
+ // stack layout of the caller. In that case, it's only legal to outline
+ // as a tail-call. Explicitly list the call instructions we know about so
+ // we don't get unexpected results with call pseudo-instructions.
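The effect of the opcode allow-list that follows, sketched as outcomes (illustration of the rules above, not new ones):

  // BL/BLX <known callee>  -> LegalTerminator: may only end a candidate; the
  //                           thunk frame later rewrites it into a tail call.
  // any other call pseudo  -> Illegal: it may depend on the caller's stack
  //                           layout, which outlining would change.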
+ auto UnknownCallOutlineType = outliner::InstrType::Illegal; + if (Opc == ARM::BL || Opc == ARM::tBL || Opc == ARM::BLX || + Opc == ARM::tBLXr || Opc == ARM::tBLXi) + UnknownCallOutlineType = outliner::InstrType::LegalTerminator; + + return UnknownCallOutlineType; + } + + // Since calls are handled, don't touch LR or PC + if (MI.modifiesRegister(ARM::LR, TRI) || MI.modifiesRegister(ARM::PC, TRI)) + return outliner::InstrType::Illegal; + + // Does this use the stack? + if (MI.modifiesRegister(ARM::SP, TRI) || MI.readsRegister(ARM::SP, TRI)) { + // True if there is no chance that any outlined candidate from this range + // could require stack fixups. That is, both + // * LR is available in the range (No save/restore around call) + // * The range doesn't include calls (No save/restore in outlined frame) + // are true. + // FIXME: This is very restrictive; the flags check the whole block, + // not just the bit we will try to outline. + bool MightNeedStackFixUp = + (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere | + MachineOutlinerMBBFlags::HasCalls)); + + if (!MightNeedStackFixUp) + return outliner::InstrType::Legal; + + return outliner::InstrType::Illegal; + } + + // Be conservative with IT blocks. + if (MI.readsRegister(ARM::ITSTATE, TRI) || + MI.modifiesRegister(ARM::ITSTATE, TRI)) + return outliner::InstrType::Illegal; + + // Don't outline positions. + if (MI.isPosition()) + return outliner::InstrType::Illegal; + + return outliner::InstrType::Legal; +} + +void ARMBaseInstrInfo::buildOutlinedFrame( + MachineBasicBlock &MBB, MachineFunction &MF, + const outliner::OutlinedFunction &OF) const { + // Nothing is needed for tail-calls. + if (OF.FrameConstructionID == MachineOutlinerTailCall) + return; + + // For thunk outlining, rewrite the last instruction from a call to a + // tail-call. + if (OF.FrameConstructionID == MachineOutlinerThunk) { + MachineInstr *Call = &*--MBB.instr_end(); + bool isThumb = Subtarget.isThumb(); + unsigned FuncOp = isThumb ? 2 : 0; + unsigned Opc = Call->getOperand(FuncOp).isReg() + ? isThumb ? ARM::tTAILJMPr : ARM::TAILJMPr + : isThumb ? Subtarget.isTargetMachO() ? ARM::tTAILJMPd + : ARM::tTAILJMPdND + : ARM::TAILJMPd; + MachineInstrBuilder MIB = BuildMI(MBB, MBB.end(), DebugLoc(), get(Opc)) + .add(Call->getOperand(FuncOp)); + if (isThumb && !Call->getOperand(FuncOp).isReg()) + MIB.add(predOps(ARMCC::AL)); + Call->eraseFromParent(); + return; + } + + // Here we have to insert the return ourselves. Get the correct opcode from + // current feature set. + BuildMI(MBB, MBB.end(), DebugLoc(), get(Subtarget.getReturnOpcode())) + .add(predOps(ARMCC::AL)); +} + +MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( + Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, + MachineFunction &MF, const outliner::Candidate &C) const { + MachineInstrBuilder MIB; + MachineBasicBlock::iterator CallPt; + unsigned Opc; + bool isThumb = Subtarget.isThumb(); + + // Are we tail calling? + if (C.CallConstructionID == MachineOutlinerTailCall) { + // If yes, then we can just branch to the label. + Opc = isThumb + ? Subtarget.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND + : ARM::TAILJMPd; + MIB = BuildMI(MF, DebugLoc(), get(Opc)) + .addGlobalAddress(M.getNamedValue(MF.getName())); + if (isThumb) + MIB.add(predOps(ARMCC::AL)); + It = MBB.insert(It, MIB); + return It; + } + + // Create the call instruction. + Opc = isThumb ? 
ARM::tBL : ARM::BL; + MachineInstrBuilder CallMIB = BuildMI(MF, DebugLoc(), get(Opc)); + if (isThumb) + CallMIB.add(predOps(ARMCC::AL)); + CallMIB.addGlobalAddress(M.getNamedValue(MF.getName())); + + // Can we save to a register? + if (C.CallConstructionID == MachineOutlinerRegSave) { + unsigned Reg = findRegisterToSaveLRTo(C); + assert(Reg != 0 && "No callee-saved register available?"); + + // Save and restore LR from that register. + if (!MBB.isLiveIn(ARM::LR)) + MBB.addLiveIn(ARM::LR); + copyPhysReg(MBB, It, DebugLoc(), Reg, ARM::LR, true); + CallPt = MBB.insert(It, CallMIB); + copyPhysReg(MBB, It, DebugLoc(), ARM::LR, Reg, true); + It--; + return CallPt; + } + // Insert the call. + It = MBB.insert(It, CallMIB); + return It; +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index f6d4ebe3a090..1a75b011ca59 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -21,6 +21,8 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsARM.h" #include <array> #include <cstdint> @@ -105,6 +107,11 @@ protected: Optional<DestSourcePair> isCopyInstrImpl(const MachineInstr &MI) const override; + /// Specialization of \ref TargetInstrInfo::describeLoadedValue, used to + /// enhance debug entry value descriptions for ARM targets. + Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI, + Register Reg) const override; + public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; @@ -146,6 +153,12 @@ public: // Predication support. bool isPredicated(const MachineInstr &MI) const override; + // MIR printer helper function to annotate Operands with a comment. + std::string + createMIROperandComment(const MachineInstr &MI, const MachineOperand &Op, + unsigned OpIdx, + const TargetRegisterInfo *TRI) const override; + ARMCC::CondCodes getPredicate(const MachineInstr &MI) const { int PIdx = MI.findFirstPredOperandIdx(); return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm() @@ -207,13 +220,13 @@ public: void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, + Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, + Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -222,7 +235,7 @@ public: bool shouldSink(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SubIdx, + Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override; @@ -286,16 +299,16 @@ public: /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. Return true if the comparison instruction /// can be analyzed. 
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, + bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Convert the instruction to set the zero flag so /// that we can remove a "comparison with zero"; Remove a redundant CMP /// instruction if the flags can be updated in the same way by an earlier /// instruction such as SUB. - bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, + bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, + Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; bool analyzeSelect(const MachineInstr &MI, @@ -308,7 +321,7 @@ public: /// FoldImmediate - 'Reg' is known to be defined by a move immediate /// instruction, try to fold the immediate into the use instruction. - bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const override; unsigned getNumMicroOps(const InstrItineraryData *ItinData, @@ -343,7 +356,27 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + /// ARM supports the MachineOutliner. + bool isFunctionSafeToOutlineFrom(MachineFunction &MF, + bool OutlineFromLinkOnceODRs) const override; + outliner::OutlinedFunction getOutliningCandidateInfo( + std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override; + outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT, + unsigned Flags) const override; + bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const override; + void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF, + const outliner::OutlinedFunction &OF) const override; + MachineBasicBlock::iterator + insertOutlinedCall(Module &M, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &It, MachineFunction &MF, + const outliner::Candidate &C) const override; + private: + /// Returns an unused general-purpose register which can be used for + /// constructing an outlined call if one exists. Returns 0 otherwise. + unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; + unsigned getInstBundleLength(const MachineInstr &MI) const; int getVLDMDefCycle(const InstrItineraryData *ItinData, @@ -403,7 +436,7 @@ private: /// Identify instructions that can be folded into a MOVCC instruction, and /// return the defining instruction. - MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI, + MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const; private: @@ -491,24 +524,6 @@ bool isUncondBranchOpcode(int Opc) { // This table shows the VPT instruction variants, i.e. the different // mask field encodings, see also B5.6. Predication/conditional execution in // the ArmARM. 
-enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 -}; - static inline bool isVPTOpcode(int Opc) { return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 || Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 || @@ -595,6 +610,18 @@ unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) { return 0; } +static inline unsigned getTailPredVectorWidth(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("unhandled vctp opcode"); + case ARM::MVE_VCTP8: return 16; + case ARM::MVE_VCTP16: return 8; + case ARM::MVE_VCTP32: return 4; + case ARM::MVE_VCTP64: return 2; + } + return 0; +} + static inline bool isVCTP(MachineInstr *MI) { switch (MI->getOpcode()) { @@ -642,20 +669,31 @@ static inline bool isPushOpcode(int Opc) { Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD; } +static inline bool isSubImmOpcode(int Opc) { + return Opc == ARM::SUBri || + Opc == ARM::tSUBi3 || Opc == ARM::tSUBi8 || + Opc == ARM::tSUBSi3 || Opc == ARM::tSUBSi8 || + Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri; +} + +static inline bool isMovRegOpcode(int Opc) { + return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr; +} /// isValidCoprocessorNumber - decide whether an explicit coprocessor /// number is legal in generic instructions like CDP. The answer can /// vary with the subtarget. static inline bool isValidCoprocessorNumber(unsigned Num, const FeatureBitset& featureBits) { + // In Armv7 and Armv8-M CP10 and CP11 clash with VFP/NEON, however, the + // coprocessor is still valid for CDP/MCR/MRC and friends. Allowing it is + // useful for code which is shared with older architectures which do not know + // the new VFP/NEON mnemonics. + // Armv8-A disallows everything *other* than 111x (CP14 and CP15). if (featureBits[ARM::HasV8Ops] && (Num & 0xE) != 0xE) return false; - // Armv7 disallows 101x (CP10 and CP11), which clash with VFP/NEON. - if (featureBits[ARM::HasV7Ops] && (Num & 0xE) == 0xA) - return false; - - // Armv8.1-M also disallows 100x (CP8,CP9) and 111x (CP14,CP15) + // Armv8.1-M disallows 100x (CP8,CP9) and 111x (CP14,CP15) // which clash with MVE. if (featureBits[ARM::HasV8_1MMainlineOps] && ((Num & 0xE) == 0x8 || (Num & 0xE) == 0xE)) @@ -667,7 +705,7 @@ static inline bool isValidCoprocessorNumber(unsigned Num, /// getInstrPredicate - If instruction is predicated, returns its predicate /// condition, otherwise returns AL. It also returns the condition code /// register by reference. -ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg); +ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, Register &PredReg); unsigned getMatchingCondBranchOpcode(unsigned Opc); @@ -681,21 +719,21 @@ unsigned convertAddSubFlagsOpcode(unsigned OldOpc); /// code. 
void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII,
unsigned MIFlags = 0);
void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII,
unsigned MIFlags = 0);
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI,
unsigned MIFlags = 0);
@@ -714,11 +752,11 @@ bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
/// offset could not be handled directly in MI, and return the left-over
/// portion by reference.
bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII);

bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII,
const TargetRegisterInfo *TRI);
@@ -733,7 +771,7 @@ MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br,
const TargetRegisterInfo *TRI);

void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB);
-void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg);
+void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, Register DestReg);

void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
@@ -753,6 +791,70 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
const ARMSubtarget *Subtarget,
bool ForCodesize = false);

+// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
+// Returns 0 for unknown instructions.
+inline int getAddSubImmediate(MachineInstr &MI) {
+ int Scale = 1;
+ unsigned ImmOp;
+ switch (MI.getOpcode()) {
+ case ARM::t2ADDri:
+ ImmOp = 2;
+ break;
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ ImmOp = 2;
+ Scale = -1;
+ break;
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ ImmOp = 3;
+ Scale = -1;
+ break;
+ default:
+ return 0;
+ }
+ return Scale * MI.getOperand(ImmOp).getImm();
+}
+
+// Given a memory access Opcode, check that the given Imm would be a valid Offset
+// for this instruction using its addressing mode.
+inline bool isLegalAddressImm(unsigned Opcode, int Imm, + const TargetInstrInfo *TII) { + const MCInstrDesc &Desc = TII->get(Opcode); + unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); + switch (AddrMode) { + case ARMII::AddrModeT2_i7: + return std::abs(Imm) < (((1 << 7) * 1) - 1); + case ARMII::AddrModeT2_i7s2: + return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0; + case ARMII::AddrModeT2_i7s4: + return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0; + default: + llvm_unreachable("Unhandled Addressing mode"); + } +} + +// Return true if the given intrinsic is a gather or scatter +inline bool isGatherScatter(IntrinsicInst *IntInst) { + if (IntInst == nullptr) + return false; + unsigned IntrinsicID = IntInst->getIntrinsicID(); + return (IntrinsicID == Intrinsic::masked_gather || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated || + IntrinsicID == Intrinsic::masked_scatter || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated); +} + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 52e6d05c3155..3579635f83b5 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -220,10 +220,25 @@ getReservedRegs(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: -isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const { +isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const { return !getReservedRegs(MF).test(PhysReg); } +bool ARMBaseRegisterInfo::isInlineAsmReadOnlyReg(const MachineFunction &MF, + unsigned PhysReg) const { + const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); + const ARMFrameLowering *TFI = getFrameLowering(MF); + + BitVector Reserved(getNumRegs()); + markSuperRegs(Reserved, ARM::PC); + if (TFI->hasFP(MF)) + markSuperRegs(Reserved, getFramePointerReg(STI)); + if (hasBasePointer(MF)) + markSuperRegs(Reserved, BasePtr); + assert(checkAllSuperRegsMarked(Reserved)); + return Reserved.test(PhysReg); +} + const TargetRegisterClass * ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { @@ -289,7 +304,8 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, } // Get the other register in a GPRPair. -static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) { +static MCPhysReg getPairedGPR(MCPhysReg Reg, bool Odd, + const MCRegisterInfo *RI) { for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers) if (ARM::GPRPairRegClass.contains(*Supers)) return RI->getSubReg(*Supers, Odd ? 
ARM::gsub_1 : ARM::gsub_0); @@ -297,15 +313,12 @@ static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) { } // Resolve the RegPairEven / RegPairOdd register allocator hints. -bool -ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, - ArrayRef<MCPhysReg> Order, - SmallVectorImpl<MCPhysReg> &Hints, - const MachineFunction &MF, - const VirtRegMap *VRM, - const LiveRegMatrix *Matrix) const { +bool ARMBaseRegisterInfo::getRegAllocationHints( + Register VirtReg, ArrayRef<MCPhysReg> Order, + SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, + const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); - std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); + std::pair<Register, Register> Hint = MRI.getRegAllocationHint(VirtReg); unsigned Odd; switch (Hint.first) { @@ -323,12 +336,12 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, // This register should preferably be even (Odd == 0) or odd (Odd == 1). // Check if the other part of the pair has already been assigned, and provide // the paired register as the first hint. - unsigned Paired = Hint.second; - if (Paired == 0) + Register Paired = Hint.second; + if (!Paired) return false; - unsigned PairedPhys = 0; - if (Register::isPhysicalRegister(Paired)) { + Register PairedPhys; + if (Paired.isPhysical()) { PairedPhys = Paired; } else if (VRM && VRM->hasPhys(Paired)) { PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this); @@ -339,11 +352,11 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, Hints.push_back(PairedPhys); // Then prefer even or odd registers. - for (unsigned Reg : Order) { + for (MCPhysReg Reg : Order) { if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd) continue; // Don't provide hints that are paired to a reserved register. - unsigned Paired = getPairedGPR(Reg, !Odd, this); + MCPhysReg Paired = getPairedGPR(Reg, !Odd, this); if (!Paired || MRI.isReserved(Paired)) continue; Hints.push_back(Reg); @@ -351,27 +364,27 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, return false; } -void -ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg, - MachineFunction &MF) const { +void ARMBaseRegisterInfo::updateRegAllocHint(Register Reg, Register NewReg, + MachineFunction &MF) const { MachineRegisterInfo *MRI = &MF.getRegInfo(); - std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg); - if ((Hint.first == (unsigned)ARMRI::RegPairOdd || - Hint.first == (unsigned)ARMRI::RegPairEven) && - Register::isVirtualRegister(Hint.second)) { + std::pair<Register, Register> Hint = MRI->getRegAllocationHint(Reg); + if ((Hint.first == ARMRI::RegPairOdd || Hint.first == ARMRI::RegPairEven) && + Hint.second.isVirtual()) { // If 'Reg' is one of the even / odd register pair and it's now changed // (e.g. coalesced) into a different register. The other register of the // pair allocation hint must be updated to reflect the relationship // change. - unsigned OtherReg = Hint.second; + Register OtherReg = Hint.second; Hint = MRI->getRegAllocationHint(OtherReg); // Make sure the pair has not already divorced. if (Hint.second == Reg) { MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg); if (Register::isVirtualRegister(NewReg)) MRI->setRegAllocationHint(NewReg, - Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven - : ARMRI::RegPairOdd, OtherReg); + Hint.first == ARMRI::RegPairOdd + ? 
ARMRI::RegPairEven + : ARMRI::RegPairOdd, + OtherReg); } } } @@ -457,14 +470,14 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const { /// specified immediate. void ARMBaseRegisterInfo::emitLoadConstPool( MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val, - ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const { + const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, Register PredReg, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get(Type::getInt32Ty(MF.getFunction().getContext()), Val); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4)); BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp)) .addReg(DestReg, getDefRegState(true), SubIdx) @@ -480,11 +493,6 @@ requiresRegisterScavenging(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: -trackLivenessAfterRegAlloc(const MachineFunction &MF) const { - return true; -} - -bool ARMBaseRegisterInfo:: requiresFrameIndexScavenging(const MachineFunction &MF) const { return true; } @@ -606,9 +614,9 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { // The FP is only available if there is no dynamic realignment. We // don't know for sure yet whether we'll need that, so we guess based // on whether there are any local variables that would trigger it. - unsigned StackAlign = TFI->getStackAlignment(); if (TFI->hasFP(MF) && - !((MFI.getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) { + !((MFI.getLocalFrameMaxAlign() > TFI->getStackAlign()) && + canRealignStack(MF))) { if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset)) return false; } @@ -626,10 +634,10 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { /// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to /// be a pointer to FrameIdx at the beginning of the basic block. -void ARMBaseRegisterInfo:: -materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, int FrameIdx, - int64_t Offset) const { +void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + Register BaseReg, + int FrameIdx, + int64_t Offset) const { ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>(); unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : (AFI->isThumb1OnlyFunction() ? 
ARM::tADDframe : ARM::t2ADDri); @@ -652,7 +660,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB, MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); } -void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, +void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); @@ -680,7 +688,8 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } -bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, +bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + Register BaseReg, int64_t Offset) const { const MCInstrDesc &Desc = MI->getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); @@ -759,7 +768,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(!AFI->isThumb1OnlyFunction() && "This eliminateFrameIndex does not support Thumb1!"); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned FrameReg; + Register FrameReg; int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 477f3ad0a9a7..0a0907af2141 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -134,7 +134,9 @@ public: BitVector getReservedRegs(const MachineFunction &MF) const override; bool isAsmClobberable(const MachineFunction &MF, - unsigned PhysReg) const override; + MCRegister PhysReg) const override; + bool isInlineAsmReadOnlyReg(const MachineFunction &MF, + unsigned PhysReg) const override; const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, @@ -149,14 +151,12 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - bool getRegAllocationHints(unsigned VirtReg, - ArrayRef<MCPhysReg> Order, + bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, - const MachineFunction &MF, - const VirtRegMap *VRM, + const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override; - void updateRegAllocHint(unsigned Reg, unsigned NewReg, + void updateRegAllocHint(Register Reg, Register NewReg, MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; @@ -165,35 +165,32 @@ public: int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, int FrameIdx, + void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg, + int FrameIdx, int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override; bool cannotEliminateFrame(const MachineFunction &MF) const; // Debug information queries. 
Register getFrameRegister(const MachineFunction &MF) const override; - unsigned getBaseRegister() const { return BasePtr; } - + Register getBaseRegister() const { return BasePtr; } /// emitLoadConstPool - Emits a load from constpool to materialize the /// specified immediate. virtual void emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, + const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0, + Register PredReg = Register(), unsigned MIFlags = MachineInstr::NoFlags) const; /// Code Generation virtual methods... bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp index 00a2231f59e3..6d389cc82730 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp @@ -49,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) { BasicBlockInfo &BBI = BBInfo[MBB->getNumber()]; BBI.Size = 0; BBI.Unalign = 0; - BBI.PostAlign = Align::None(); + BBI.PostAlign = Align(1); for (MachineInstr &I : *MBB) { BBI.Size += TII->getInstSizeInBytes(I); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h index 13df399ed995..47d9a4049fa0 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMBasicBlockInfo.h @@ -87,10 +87,10 @@ struct BasicBlockInfo { /// Compute the offset immediately following this block. If Align is /// specified, return the offset the successor block will get if it has /// this alignment. - unsigned postOffset(Align Alignment = Align::None()) const { + unsigned postOffset(Align Alignment = Align(1)) const { unsigned PO = Offset + Size; const Align PA = std::max(PostAlign, Alignment); - if (PA == Align::None()) + if (PA == Align(1)) return PO; // Add alignment padding from the terminator. return PO + UnknownPadding(PA, internalKnownBits()); @@ -101,7 +101,7 @@ struct BasicBlockInfo { /// instruction alignment. An aligned terminator may increase the number /// of know bits. /// If LogAlign is given, also consider the alignment of the next block. 
- unsigned postKnownBits(Align Align = Align::None()) const { + unsigned postKnownBits(Align Align = llvm::Align(1)) const { return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits()); } }; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp index ce260a9ba145..d860473011e7 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallLowering.cpp @@ -99,17 +99,14 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { LLT p0 = LLT::pointer(0, 32); LLT s32 = LLT::scalar(32); - Register SPReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(SPReg, Register(ARM::SP)); + auto SPReg = MIRBuilder.buildCopy(p0, Register(ARM::SP)); - Register OffsetReg = MRI.createGenericVirtualRegister(s32); - MIRBuilder.buildConstant(OffsetReg, Offset); + auto OffsetReg = MIRBuilder.buildConstant(s32, Offset); - Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); + auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); - return AddrReg; + return AddrReg.getReg(0); } void assignValueToReg(Register ValVReg, Register PhysReg, @@ -133,7 +130,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { Register ExtReg = extendRegister(ValVReg, VA); auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(), - /* Alignment */ 1); + Align(1)); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } @@ -143,7 +140,10 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { CCValAssign VA = VAs[0]; assert(VA.needsCustom() && "Value doesn't need custom handling"); - assert(VA.getValVT() == MVT::f64 && "Unsupported type"); + + // Custom lowering for other types, such as f16, is currently not supported + if (VA.getValVT() != MVT::f64) + return 0; CCValAssign NextVA = VAs[1]; assert(NextVA.needsCustom() && "Value doesn't need custom handling"); @@ -203,7 +203,7 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, // Even if there is no splitting to do, we still want to replace the // original type (e.g. pointer type -> integer). 
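The alignment recorded just below comes straight from the DataLayout. For instance (usual 32-bit ARM ABI values, assumed here rather than taken from this patch; Ctx is the LLVMContext in scope):

  Align A32 = DL.getABITypeAlign(Type::getInt32Ty(Ctx));  // Align(4)
  Align A64 = DL.getABITypeAlign(Type::getDoubleTy(Ctx)); // Align(8)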
auto Flags = OrigArg.Flags[0]; - Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty))); + Flags.setOrigAlign(DL.getABITypeAlign(OrigArg.Ty)); SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx), Flags, OrigArg.IsFixed); return; @@ -215,7 +215,7 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, Type *SplitTy = SplitVT.getTypeForEVT(Ctx); auto Flags = OrigArg.Flags[0]; - Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy))); + Flags.setOrigAlign(DL.getABITypeAlign(SplitTy)); bool NeedsConsecutiveRegisters = TLI.functionArgumentNeedsConsecutiveRegisters( @@ -299,11 +299,8 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - Register AddrReg = - MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32)); - MIRBuilder.buildFrameIndex(AddrReg, FI); - - return AddrReg; + return MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI) + .getReg(0); } void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, @@ -318,20 +315,21 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { Size = 4; assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm"); - auto LoadVReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - buildLoad(LoadVReg, Addr, Size, /* Alignment */ 1, MPO); + auto LoadVReg = buildLoad(LLT::scalar(32), Addr, Size, MPO); MIRBuilder.buildTrunc(ValVReg, LoadVReg); } else { // If the value is not extended, a simple load will suffice. - buildLoad(ValVReg, Addr, Size, /* Alignment */ 1, MPO); + buildLoad(ValVReg, Addr, Size, MPO); } } - void buildLoad(Register Val, Register Addr, uint64_t Size, unsigned Alignment, - MachinePointerInfo &MPO) { - auto MMO = MIRBuilder.getMF().getMachineMemOperand( - MPO, MachineMemOperand::MOLoad, Size, Alignment); - MIRBuilder.buildLoad(Val, Addr, *MMO); + MachineInstrBuilder buildLoad(const DstOp &Res, Register Addr, uint64_t Size, + MachinePointerInfo &MPO) { + MachineFunction &MF = MIRBuilder.getMF(); + + auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, + inferAlignFromPtrInfo(MF, MPO)); + return MIRBuilder.buildLoad(Res, Addr, *MMO); } void assignValueToReg(Register ValVReg, Register PhysReg, @@ -354,9 +352,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { // We cannot create a truncating copy, nor a trunc of a physical register. // Therefore, we need to copy the content of the physical register into a // virtual one and then truncate that. 
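The copy-then-truncate pattern described above, in the fluent MachineIRBuilder form this patch moves to (a sketch assuming a 32-bit location feeding a narrower virtual register):

  auto Wide = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg); // plain COPY
  MIRBuilder.buildTrunc(ValVReg, Wide);                       // G_TRUNC to width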
- auto PhysRegToVReg = - MRI.createGenericVirtualRegister(LLT::scalar(LocSize)); - MIRBuilder.buildCopy(PhysRegToVReg, PhysReg); + auto PhysRegToVReg = MIRBuilder.buildCopy(LLT::scalar(LocSize), PhysReg); MIRBuilder.buildTrunc(ValVReg, PhysRegToVReg); } } @@ -367,7 +363,10 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { CCValAssign VA = VAs[0]; assert(VA.needsCustom() && "Value doesn't need custom handling"); - assert(VA.getValVT() == MVT::f64 && "Unsupported type"); + + // Custom lowering for other types, such as f16, is currently not supported + if (VA.getValVT() != MVT::f64) + return 0; CCValAssign NextVA = VAs[1]; assert(NextVA.needsCustom() && "Value doesn't need custom handling"); @@ -436,7 +435,7 @@ bool ARMCallLowering::lowerFormalArguments( for (auto &Arg : F.args()) { if (!isSupportedType(DL, TLI, Arg.getType())) return false; - if (Arg.hasByValOrInAllocaAttr()) + if (Arg.hasPassPointeeByValueAttr()) return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp index a47c59512592..67c822a5b6ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -32,9 +32,8 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, return false; // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 4), - LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomMem( + ValNo, ValVT, State.AllocateStack(8, Align(4)), LocVT, LocInfo)); return true; } @@ -42,9 +41,8 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, if (unsigned Reg = State.AllocateReg(RegList)) State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); else - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(4, 4), - LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomMem( + ValNo, ValVT, State.AllocateStack(4, Align(4)), LocVT, LocInfo)); return true; } @@ -81,9 +79,8 @@ static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, return false; // Put the whole thing on the stack. - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, - State.AllocateStack(8, 8), - LocVT, LocInfo)); + State.addLoc(CCValAssign::getCustomMem( + ValNo, ValVT, State.AllocateStack(8, Align(8)), LocVT, LocInfo)); return true; } @@ -184,8 +181,8 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, // aggregate. Store the type's required alignment as extra info for later: in // the [N x i64] case all trace has been removed by the time we actually get // to do allocation. - PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo, - ArgFlags.getOrigAlign())); + PendingMembers.push_back(CCValAssign::getPending( + ValNo, ValVT, LocVT, LocInfo, ArgFlags.getNonZeroOrigAlign().value())); if (!ArgFlags.isInConsecutiveRegsLast()) return true; @@ -193,8 +190,9 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
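A worked instance of the contiguous-block rule described above (standard AAPCS behaviour for homogeneous aggregates, not specific to this patch):

  // struct HFA { float a, b, c, d; };  ->  four pending f32 members: S0-S3
  // struct HDA { double x, y; };       ->  two pending f64 members:  D0-D1
  // members that no longer fit in the register list fall through to
  // AllocateStack instead.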
auto &DL = State.getMachineFunction().getDataLayout(); - unsigned StackAlign = DL.getStackAlignment().value(); - unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); + const Align StackAlign = DL.getStackAlignment(); + const Align FirstMemberAlign(PendingMembers[0].getExtraInfo()); + Align Alignment = std::min(FirstMemberAlign, StackAlign); ArrayRef<MCPhysReg> RegList; switch (LocVT.SimpleTy) { @@ -204,21 +202,24 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, // First consume all registers that would give an unaligned object. Whether // we go on stack or in regs, no-one will be using them in future. - unsigned RegAlign = alignTo(Align, 4) / 4; + unsigned RegAlign = alignTo(Alignment.value(), 4) / 4; while (RegIdx % RegAlign != 0 && RegIdx < RegList.size()) State.AllocateReg(RegList[RegIdx++]); break; } case MVT::f16: + case MVT::bf16: case MVT::f32: RegList = SRegList; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::f64: RegList = DRegList; break; case MVT::v8f16: + case MVT::v8bf16: case MVT::v2f64: RegList = QRegList; break; @@ -247,7 +248,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, unsigned RegIdx = State.getFirstUnallocated(RegList); for (auto &It : PendingMembers) { if (RegIdx >= RegList.size()) - It.convertToMem(State.AllocateStack(Size, Size)); + It.convertToMem(State.AllocateStack(Size, Align(Size))); else It.convertToReg(State.AllocateReg(RegList[RegIdx++])); @@ -265,12 +266,12 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, // After the first item has been allocated, the rest are packed as tightly as // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll // be allocating a bunch of i32 slots). - unsigned RestAlign = std::min(Align, Size); + const Align RestAlign = std::min(Alignment, Align(Size)); for (auto &It : PendingMembers) { - It.convertToMem(State.AllocateStack(Size, Align)); + It.convertToMem(State.AllocateStack(Size, Alignment)); State.addLoc(It); - Align = RestAlign; + Alignment = RestAlign; } // All pending members have now been allocated @@ -280,5 +281,33 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, return true; } +static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, CCState &State, + ArrayRef<MCPhysReg> RegList) { + unsigned Reg = State.AllocateReg(RegList); + if (Reg) { + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; +} + +static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + // f16 arguments are extended to i32 and assigned to a register in [r0, r3] + return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State, + RRegList); +} + +static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + // f16 arguments are extended to f32 and assigned to a register in [s0, s15] + return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State, + SRegList); +} + // Include the table generated calling convention implementations. 
#include "ARMGenCallingConv.inc" diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td index 5df5b56f5afa..3517274e4c5c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMCallingConv.td @@ -10,7 +10,7 @@ /// CCIfAlign - Match of the original alignment of the arg class CCIfAlign<string Align, CCAction A>: - CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; + CCIf<!strconcat("ArgFlags.getNonZeroOrigAlign() == ", Align), A>; //===----------------------------------------------------------------------===// // ARM APCS Calling Convention @@ -30,8 +30,8 @@ def CC_ARM_APCS : CallingConv<[ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>, @@ -56,8 +56,8 @@ def RetCC_ARM_APCS : CallingConv<[ CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>, @@ -71,8 +71,8 @@ def RetCC_ARM_APCS : CallingConv<[ let Entry = 1 in def FastCC_ARM_APCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, @@ -91,8 +91,8 @@ def FastCC_ARM_APCS : CallingConv<[ let Entry = 1 in def RetFastCC_ARM_APCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, @@ -108,8 +108,8 @@ def RetFastCC_ARM_APCS : CallingConv<[ let Entry = 1 in def CC_ARM_APCS_GHC : CallingConv<[ // Handle all vector types as either f64 or v2f64. 
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>, CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>, @@ -134,12 +134,12 @@ def CC_ARM_AAPCS_Common : CallingConv<[ // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register // (and the same is true for f64 if VFP is not enabled) CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>, - CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8", + CCIfType<[i32], CCIf<"ArgFlags.getNonZeroOrigAlign() != Align(8)", CCAssignToReg<[R0, R1, R2, R3]>>>, CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>, CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>, - CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, + CCIfType<[f16, bf16, f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>, CCIfType<[v2f64], CCIfAlign<"16", CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>, @@ -165,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[ CCIfNest<CCAssignToReg<[R12]>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -176,14 +176,15 @@ def CC_ARM_AAPCS : CallingConv<[ CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType<i32>>, + CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>, CCDelegateTo<CC_ARM_AAPCS_Common> ]>; let Entry = 1 in def RetCC_ARM_AAPCS : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -193,6 +194,7 @@ def RetCC_ARM_AAPCS : CallingConv<[ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType<i32>>, + CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>, CCDelegateTo<RetCC_ARM_AAPCS_Common> ]>; @@ -208,8 +210,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ CCIfByVal<CCPassByVal<4, 4>>, // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. 
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -224,14 +226,15 @@ def CC_ARM_AAPCS_VFP : CallingConv<[ CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>, CCDelegateTo<CC_ARM_AAPCS_Common> ]>; let Entry = 1 in def RetCC_ARM_AAPCS_VFP : CallingConv<[ // Handle all vector types as either f64 or v2f64. - CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>, - CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>, + CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>, + CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>, // Pass SwiftSelf in a callee saved register. CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>, @@ -242,7 +245,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>, CCDelegateTo<RetCC_ARM_AAPCS_Common> ]>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 66ad120a111f..195d0a89291b 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -206,10 +206,6 @@ namespace { /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions. SmallVector<MachineInstr*, 4> T2JumpTables; - /// HasFarJump - True if any far jump instruction has been emitted during - /// the branch fix up pass. - bool HasFarJump; - MachineFunction *MF; MachineConstantPool *MCP; const ARMBaseInstrInfo *TII; @@ -270,7 +266,6 @@ namespace { bool fixupImmediateBr(ImmBranch &Br); bool fixupConditionalBr(ImmBranch &Br); bool fixupUnconditionalBr(ImmBranch &Br); - bool undoLRSpillRestore(); bool optimizeThumb2Instructions(); bool optimizeThumb2Branches(); bool reorderThumb2JumpTables(); @@ -350,7 +345,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: " << MCP->getConstants().size() << " CP entries, aligned to " - << MCP->getConstantPoolAlignment() << " bytes *****\n"); + << MCP->getConstantPoolAlign().value() << " bytes *****\n"); STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget()); TII = STI->getInstrInfo(); @@ -363,7 +358,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { isThumb1 = AFI->isThumb1OnlyFunction(); isThumb2 = AFI->isThumb2Function(); - HasFarJump = false; bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB); // Renumber all of the machine basic blocks in the function, guaranteeing that @@ -456,11 +450,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { // After a while, this might be made debug-only, but it is not expensive. verify(); - // If LR has been forced spilled and no far jump (i.e. BL) has been issued, - // undo the spill / restore of LR if possible. - if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump()) - MadeChange |= undoLRSpillRestore(); - // Save the mapping between original and cloned constpool entries. 
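Several hunks in this file trade raw unsigned alignments for the checked Align type; a minimal sketch of the arithmetic that type provides (llvm/Support/Alignment.h):

  Align A(8);                               // power of two, enforced
  unsigned L = Log2(A);                     // 3
  uint64_t Pad = offsetToAlignment(10, A);  // 6 bytes to reach the next multiple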
for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) { @@ -494,7 +483,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) MF->push_back(BB); // MachineConstantPool measures alignment in bytes. - const Align MaxAlign(MCP->getConstantPoolAlignment()); + const Align MaxAlign = MCP->getConstantPoolAlign(); const unsigned MaxLogAlign = Log2(MaxAlign); // Mark the basic block as required by the const-pool. @@ -518,14 +507,13 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); - unsigned Align = CPs[i].getAlignment(); - assert(isPowerOf2_32(Align) && "Invalid alignment"); + Align Alignment = CPs[i].getAlign(); // Verify that all constant pool entries are a multiple of their alignment. // If not, we would have to pad them out so that instructions stay aligned. - assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!"); + assert(isAligned(Alignment, Size) && "CP Entry not multiple of 4 bytes!"); // Insert CONSTPOOL_ENTRY before entries with a smaller alignment. - unsigned LogAlign = Log2_32(Align); + unsigned LogAlign = Log2(Alignment); MachineBasicBlock::iterator InsAt = InsPoint[LogAlign]; MachineInstr *CPEMI = BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY)) @@ -542,7 +530,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) CPEntries.emplace_back(1, CPEntry(CPEMI, i)); ++NumCPEs; LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = " - << Size << ", align = " << Align << '\n'); + << Size << ", align = " << Alignment.value() << '\n'); } LLVM_DEBUG(BB->dump()); } @@ -668,7 +656,7 @@ Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) { unsigned CPI = getCombinedIndex(CPEMI); assert(CPI < MCP->getConstants().size() && "Invalid constant pool index."); - return Align(MCP->getConstants()[CPI].getAlignment()); + return MCP->getConstants()[CPI].getAlign(); } /// scanFunctionJumpTables - Do a scan of the function, building up @@ -1364,8 +1352,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // displacement. MachineBasicBlock::iterator I = UserMI; ++I; - for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI), - PredReg = 0; + Register PredReg; + for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI); I->getOpcode() != ARM::t2IT && getITInstrPredicate(*I, PredReg) != ARMCC::AL; Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) { @@ -1410,7 +1398,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // Avoid splitting an IT block. if (LastIT) { - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg); if (CC != ARMCC::AL) MI = LastIT; @@ -1434,7 +1422,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, // We really must not split an IT block. #ifndef NDEBUG - unsigned PredReg; + Register PredReg; assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL); #endif NewMBB = splitBlockBeforeInstr(&*MI); @@ -1566,7 +1554,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { BBInfo[CPEBB->getNumber()].Size = 0; // This block no longer needs to be aligned. - CPEBB->setAlignment(Align::None()); + CPEBB->setAlignment(Align(1)); } else { // Entries are sorted by descending alignment, so realign from the front. 
CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin())); @@ -1633,7 +1621,6 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) { BBInfoVector &BBInfo = BBUtils->getBBInfo(); BBInfo[MBB->getNumber()].Size += 2; BBUtils->adjustBBOffsetsAfter(MBB); - HasFarJump = true; ++NumUBrFixed; LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI); @@ -1735,34 +1722,6 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { return true; } -/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills -/// LR / restores LR to pc. FIXME: This is done here because it's only possible -/// to do this if tBfar is not used. -bool ARMConstantIslands::undoLRSpillRestore() { - bool MadeChange = false; - for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) { - MachineInstr *MI = PushPopMIs[i]; - // First two operands are predicates. - if (MI->getOpcode() == ARM::tPOP_RET && - MI->getOperand(2).getReg() == ARM::PC && - MI->getNumExplicitOperands() == 3) { - // Create the new insn and copy the predicate from the old. - BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET)) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)); - MI->eraseFromParent(); - MadeChange = true; - } else if (MI->getOpcode() == ARM::tPUSH && - MI->getOperand(2).getReg() == ARM::LR && - MI->getNumExplicitOperands() == 3) { - // Just remove the push. - MI->eraseFromParent(); - MadeChange = true; - } - } - return MadeChange; -} - bool ARMConstantIslands::optimizeThumb2Instructions() { bool MadeChange = false; @@ -1868,7 +1827,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() { if (!Br.MI->killsRegister(ARM::CPSR)) return false; - unsigned PredReg = 0; + Register PredReg; unsigned NewOpc = 0; ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg); if (Pred == ARMCC::EQ) @@ -2402,6 +2361,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { SmallVector<MachineOperand, 4> CondPrior; MachineFunction::iterator BBi = BB->getIterator(); MachineFunction::iterator OldPrior = std::prev(BBi); + MachineFunction::iterator OldNext = std::next(BBi); // If the block terminator isn't analyzable, don't try to move the block bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond); @@ -2412,8 +2372,8 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { if (!B && Cond.empty() && BB != &MF->front() && !TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) { BB->moveAfter(JTBB); - OldPrior->updateTerminator(); - BB->updateTerminator(); + OldPrior->updateTerminator(BB); + BB->updateTerminator(OldNext != MF->end() ? &*OldNext : nullptr); // Update numbering to account for the block being moved. 
MF->RenumberBlocks(); ++NumJTMoved; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp index 72c95f441265..c1df7ef43cad 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -73,7 +73,7 @@ StringRef ARMConstantPoolValue::getModifierText() const { } int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) { + Align Alignment) { llvm_unreachable("Shouldn't be calling this directly!"); } @@ -189,7 +189,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const { } int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) { + Align Alignment) { int index = getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment); if (index != -1) { @@ -228,7 +228,7 @@ ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, StringRef s, bool AddCurrentAddress) : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier, AddCurrentAddress), - S(s) {} + S(std::string(s)) {} ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C, StringRef s, unsigned ID, @@ -237,7 +237,7 @@ ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C, } int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) { + Align Alignment) { return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment); } @@ -277,7 +277,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C, } int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) { + Align Alignment) { return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 660b7fc88d82..261070a74ba3 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -76,13 +76,11 @@ protected: bool AddCurrentAddress); template <typename Derived> - int getExistingMachineCPValueImpl(MachineConstantPool *CP, - unsigned Alignment) { - unsigned AlignMask = Alignment - 1; + int getExistingMachineCPValueImpl(MachineConstantPool *CP, Align Alignment) { const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants(); for (unsigned i = 0, e = Constants.size(); i != e; ++i) { if (Constants[i].isMachineConstantPoolEntry() && - (Constants[i].getAlignment() & AlignMask) == 0) { + Constants[i].getAlign() >= Alignment) { auto *CPV = static_cast<ARMConstantPoolValue*>(Constants[i].Val.MachineCPVal); if (Derived *APC = dyn_cast<Derived>(CPV)) @@ -114,7 +112,7 @@ public: bool isPromotedGlobal() const{ return Kind == ARMCP::CPPromotedGlobal; } int getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) override; + Align Alignment) override; void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; @@ -187,7 +185,7 @@ public: } int getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) override; + Align Alignment) override; /// hasSameValue - Return true if this ARM constpool value can share the same /// constantpool entry as another ARM constpool value. 
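A recurring pattern in the hunks above (ARMConstantIslandPass.cpp and ARMConstantPoolValue.{h,cpp}) is the migration from raw unsigned alignments to the typed llvm::Align. The semantically interesting spot is getExistingMachineCPValueImpl: the old mask test "(getAlignment() & AlignMask) == 0" and the new comparison "getAlign() >= Alignment" agree whenever both values are powers of two, which Align guarantees by construction (Align also cannot be zero, which is why the ARMFastISel hunks further down drop their "if (Align == 0)" fallback paths). A minimal standalone sketch of that equivalence, using plain integers rather than the real llvm::Align class:

    #include <cassert>
    #include <cstdint>

    // Both operands are powers of two, as llvm::Align guarantees.
    static bool oldMaskCheck(uint64_t EntryAlign, uint64_t Required) {
      // Entry usable if its alignment is a multiple of the required one.
      return (EntryAlign & (Required - 1)) == 0;
    }

    static bool newCompareCheck(uint64_t EntryAlign, uint64_t Required) {
      // For powers of two, "multiple of" collapses to ">=".
      return EntryAlign >= Required;
    }

    int main() {
      for (uint64_t E = 1; E <= 4096; E <<= 1)
        for (uint64_t R = 1; R <= 4096; R <<= 1)
          assert(oldMaskCheck(E, R) == newCompareCheck(E, R));
    }
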
@@ -223,7 +221,7 @@ public: StringRef getSymbol() const { return S; } int getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) override; + Align Alignment) override; void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; @@ -259,7 +257,7 @@ public: const MachineBasicBlock *getMBB() const { return MBB; } int getExistingMachineCPValue(MachineConstantPool *CP, - unsigned Alignment) override; + Align Alignment) override; void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index de4377ec5a47..48622aae3cb4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -71,6 +71,38 @@ namespace { unsigned Opc, bool IsExt); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); + void CMSEClearGPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + const SmallVectorImpl<unsigned> &ClearRegs, + unsigned ClobberReg); + MachineBasicBlock &CMSEClearFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + MachineBasicBlock &CMSEClearFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs); + MachineBasicBlock &CMSEClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs); + void CMSESaveClearFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, + SmallVectorImpl<unsigned> &AvailableRegs); + void CMSESaveClearFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, + SmallVectorImpl<unsigned> &ScratchRegs); + void CMSESaveClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs); + void CMSERestoreFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs); + void CMSERestoreFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs); + void CMSERestoreFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs); bool ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, unsigned UxtOp, @@ -417,8 +449,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { // Make sure the table is sorted. 
static std::atomic<bool> TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { - assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && - "NEONLdStTable is not sorted!"); + assert(llvm::is_sorted(NEONLdStTable) && "NEONLdStTable is not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } #endif @@ -827,7 +858,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); @@ -852,10 +883,13 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned ImmVal = (unsigned)MO.getImm(); unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + unsigned MIFlags = MI.getFlags(); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); LO16.cloneMemRefs(MI); HI16.cloneMemRefs(MI); + LO16.setMIFlags(MIFlags); + HI16.setMIFlags(MIFlags); LO16.addImm(Pred).addReg(PredReg).add(condCodeOp()); HI16.addImm(Pred).addReg(PredReg).add(condCodeOp()); if (isCC) @@ -867,6 +901,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned LO16Opc = 0; unsigned HI16Opc = 0; + unsigned MIFlags = MI.getFlags(); if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { LO16Opc = ARM::t2MOVi16; HI16Opc = ARM::t2MOVTi16; @@ -880,6 +915,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg); + LO16.setMIFlags(MIFlags); + HI16.setMIFlags(MIFlags); + switch (MO.getType()) { case MachineOperand::MO_Immediate: { unsigned Imm = MO.getImm(); @@ -921,6 +959,582 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump();); } +// The size of the area, accessed by that VLSTM/VLLDM +// S0-S31 + FPSCR + 8 more bytes (VPR + pad, or just pad) +static const int CMSE_FP_SAVE_SIZE = 136; + +static void determineGPRegsToClear(const MachineInstr &MI, + const std::initializer_list<unsigned> &Regs, + SmallVectorImpl<unsigned> &ClearRegs) { + SmallVector<unsigned, 4> OpRegs; + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + OpRegs.push_back(Op.getReg()); + } + llvm::sort(OpRegs); + + std::set_difference(Regs.begin(), Regs.end(), OpRegs.begin(), OpRegs.end(), + std::back_inserter(ClearRegs)); +} + +void ARMExpandPseudo::CMSEClearGPRegs( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const SmallVectorImpl<unsigned> &ClearRegs, + unsigned ClobberReg) { + + if (STI->hasV8_1MMainlineOps()) { + // Clear the registers using the CLRM instruction. + MachineInstrBuilder CLRM = + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2CLRM)).add(predOps(ARMCC::AL)); + for (unsigned R : ClearRegs) + CLRM.addReg(R, RegState::Define); + CLRM.addReg(ARM::APSR, RegState::Define); + CLRM.addReg(ARM::CPSR, RegState::Define | RegState::Implicit); + } else { + // Clear the registers and flags by copying ClobberReg into them. + // (Baseline can't do a high register clear in one instruction). 
+ for (unsigned Reg : ClearRegs) { + if (Reg == ClobberReg) + continue; + BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVr), Reg) + .addReg(ClobberReg) + .add(predOps(ARMCC::AL)); + } + + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2MSR_M)) + .addImm(STI->hasDSP() ? 0xc00 : 0x800) + .addReg(ClobberReg) + .add(predOps(ARMCC::AL)); + } +} + +// Find which FP registers need to be cleared. The parameter `ClearRegs` is +// initialised with all elements set to true, and this function resets all the +// bits, which correspond to register uses. Returns true if any floating point +// register is defined, false otherwise. +static bool determineFPRegsToClear(const MachineInstr &MI, + BitVector &ClearRegs) { + bool DefFP = false; + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + + unsigned Reg = Op.getReg(); + if (Op.isDef()) { + if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) || + (Reg >= ARM::D0 && Reg <= ARM::D15) || + (Reg >= ARM::S0 && Reg <= ARM::S31)) + DefFP = true; + continue; + } + + if (Reg >= ARM::Q0 && Reg <= ARM::Q7) { + int R = Reg - ARM::Q0; + ClearRegs.reset(R * 4, (R + 1) * 4); + } else if (Reg >= ARM::D0 && Reg <= ARM::D15) { + int R = Reg - ARM::D0; + ClearRegs.reset(R * 2, (R + 1) * 2); + } else if (Reg >= ARM::S0 && Reg <= ARM::S31) { + ClearRegs[Reg - ARM::S0] = false; + } + } + return DefFP; +} + +MachineBasicBlock & +ARMExpandPseudo::CMSEClearFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + BitVector ClearRegs(16, true); + (void)determineFPRegsToClear(*MBBI, ClearRegs); + + if (STI->hasV8_1MMainlineOps()) + return CMSEClearFPRegsV81(MBB, MBBI, ClearRegs); + else + return CMSEClearFPRegsV8(MBB, MBBI, ClearRegs); +} + +// Clear the FP registers for v8.0-M, by copying over the content +// of LR. Uses R12 as a scratch register. +MachineBasicBlock & +ARMExpandPseudo::CMSEClearFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs) { + if (!STI->hasFPRegs()) + return MBB; + + auto &RetI = *MBBI; + const DebugLoc &DL = RetI.getDebugLoc(); + + // If optimising for minimum size, clear FP registers unconditionally. + // Otherwise, check the CONTROL.SFPA (Secure Floating-Point Active) bit and + // don't clear them if they belong to the non-secure state. + MachineBasicBlock *ClearBB, *DoneBB; + if (STI->hasMinSize()) { + ClearBB = DoneBB = &MBB; + } else { + MachineFunction *MF = MBB.getParent(); + ClearBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), ClearBB); + MF->insert(++ClearBB->getIterator(), DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MBBI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + MBB.addSuccessor(ClearBB); + MBB.addSuccessor(DoneBB); + ClearBB->addSuccessor(DoneBB); + + // At the new basic blocks we need to have live-in the registers, used + // for the return value as well as LR, used to clear registers. + for (const MachineOperand &Op : RetI.operands()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + if (Reg == ARM::NoRegister || Reg == ARM::LR) + continue; + assert(Register::isPhysicalRegister(Reg) && "Unallocated register"); + ClearBB->addLiveIn(Reg); + DoneBB->addLiveIn(Reg); + } + ClearBB->addLiveIn(ARM::LR); + DoneBB->addLiveIn(ARM::LR); + + // Read the CONTROL register. + BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2MRS_M), ARM::R12) + .addImm(20) + .add(predOps(ARMCC::AL)); + // Check bit 3 (SFPA). 
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2TSTri)) + .addReg(ARM::R12) + .addImm(8) + .add(predOps(ARMCC::AL)); + // If SFPA is clear, jump over ClearBB to DoneBB. + BuildMI(MBB, MBB.end(), DL, TII->get(ARM::tBcc)) + .addMBB(DoneBB) + .addImm(ARMCC::EQ) + .addReg(ARM::CPSR, RegState::Kill); + } + + // Emit the clearing sequence + for (unsigned D = 0; D < 8; D++) { + // Attempt to clear as double + if (ClearRegs[D * 2 + 0] && ClearRegs[D * 2 + 1]) { + unsigned Reg = ARM::D0 + D; + BuildMI(ClearBB, DL, TII->get(ARM::VMOVDRR), Reg) + .addReg(ARM::LR) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + } else { + // Clear first part as single + if (ClearRegs[D * 2 + 0]) { + unsigned Reg = ARM::S0 + D * 2; + BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + } + // Clear second part as single + if (ClearRegs[D * 2 + 1]) { + unsigned Reg = ARM::S0 + D * 2 + 1; + BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + } + } + } + + // Clear FPSCR bits 0-4, 7, 28-31 + // The other bits are program global according to the AAPCS + BuildMI(ClearBB, DL, TII->get(ARM::VMRS), ARM::R12) + .add(predOps(ARMCC::AL)); + BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12) + .addReg(ARM::R12) + .addImm(0x0000009F) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12) + .addReg(ARM::R12) + .addImm(0xF0000000) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(ClearBB, DL, TII->get(ARM::VMSR)) + .addReg(ARM::R12) + .add(predOps(ARMCC::AL)); + + return *DoneBB; +} + +MachineBasicBlock & +ARMExpandPseudo::CMSEClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs) { + auto &RetI = *MBBI; + + // Emit a sequence of VSCCLRM <sreglist> instructions, one instruction for + // each contiguous sequence of S-registers. + int Start = -1, End = -1; + for (int S = 0, E = ClearRegs.size(); S != E; ++S) { + if (ClearRegs[S] && S == End + 1) { + End = S; // extend range + continue; + } + // Emit current range. + if (Start < End) { + MachineInstrBuilder VSCCLRM = + BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS)) + .add(predOps(ARMCC::AL)); + while (++Start <= End) + VSCCLRM.addReg(ARM::S0 + Start, RegState::Define); + VSCCLRM.addReg(ARM::VPR, RegState::Define); + } + Start = End = S; + } + // Emit last range. 
+ if (Start < End) { + MachineInstrBuilder VSCCLRM = + BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS)) + .add(predOps(ARMCC::AL)); + while (++Start <= End) + VSCCLRM.addReg(ARM::S0 + Start, RegState::Define); + VSCCLRM.addReg(ARM::VPR, RegState::Define); + } + + return MBB; +} + +void ARMExpandPseudo::CMSESaveClearFPRegs( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) { + if (STI->hasV8_1MMainlineOps()) + CMSESaveClearFPRegsV81(MBB, MBBI, DL, LiveRegs); + else + CMSESaveClearFPRegsV8(MBB, MBBI, DL, LiveRegs, ScratchRegs); +} + +// Save and clear FP registers if present +void ARMExpandPseudo::CMSESaveClearFPRegsV8( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) { + if (!STI->hasFPRegs()) + return; + + // Store an available register for FPSCR clearing + assert(!ScratchRegs.empty()); + unsigned SpareReg = ScratchRegs.front(); + + // save space on stack for VLSTM + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); + + // Use ScratchRegs to store the fp regs + std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs; + std::vector<unsigned> NonclearedFPRegs; + for (const MachineOperand &Op : MBBI->operands()) { + if (Op.isReg() && Op.isUse()) { + unsigned Reg = Op.getReg(); + assert(!ARM::DPRRegClass.contains(Reg) || + ARM::DPR_VFP2RegClass.contains(Reg)); + assert(!ARM::QPRRegClass.contains(Reg)); + if (ARM::DPR_VFP2RegClass.contains(Reg)) { + if (ScratchRegs.size() >= 2) { + unsigned SaveReg2 = ScratchRegs.pop_back_val(); + unsigned SaveReg1 = ScratchRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD)) + .addReg(SaveReg1, RegState::Define) + .addReg(SaveReg2, RegState::Define) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } else if (ARM::SPRRegClass.contains(Reg)) { + if (ScratchRegs.size() >= 1) { + unsigned SaveReg = ScratchRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg, 0); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } + } + } + + bool passesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty()); + + // Lazy store all fp registers to the stack + MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1, + ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7}) + VLSTM.addReg(R, RegState::Implicit | + (LiveRegs.contains(R) ? 
0 : RegState::Undef)); + + // Restore all arguments + for (const auto &Regs : ClearedFPRegs) { + unsigned Reg, SaveReg1, SaveReg2; + std::tie(Reg, SaveReg1, SaveReg2) = Regs; + if (ARM::DPR_VFP2RegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg) + .addReg(SaveReg1) + .addReg(SaveReg2) + .add(predOps(ARMCC::AL)); + else if (ARM::SPRRegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(SaveReg1) + .add(predOps(ARMCC::AL)); + } + + for (unsigned Reg : NonclearedFPRegs) { + if (ARM::DPR_VFP2RegClass.contains(Reg)) { + if (STI->isLittle()) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRD), Reg) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2) + .add(predOps(ARMCC::AL)); + } else { + // For big-endian targets we need to load the two subregisters of Reg + // manually because VLDRD would load them in wrong order + unsigned SReg0 = TRI->getSubReg(Reg, ARM::ssub_0); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0 + 1) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2 + 1) + .add(predOps(ARMCC::AL)); + } + } else if (ARM::SPRRegClass.contains(Reg)) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), Reg) + .addReg(ARM::SP) + .addImm(Reg - ARM::S0) + .add(predOps(ARMCC::AL)); + } + } + // restore FPSCR from stack and clear bits 0-4, 7, 28-31 + // The other bits are program global according to the AAPCS + if (passesFPReg) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2LDRi8), SpareReg) + .addReg(ARM::SP) + .addImm(0x40) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg) + .addReg(SpareReg) + .addImm(0x0000009F) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg) + .addReg(SpareReg) + .addImm(0xF0000000) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMSR)) + .addReg(SpareReg) + .add(predOps(ARMCC::AL)); + // The ldr must happen after a floating point instruction. To prevent the + // post-ra scheduler to mess with the order, we create a bundle. + finalizeBundle(MBB, VLSTM->getIterator(), MBBI->getIterator()); + } +} + +void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc &DL, + const LivePhysRegs &LiveRegs) { + BitVector ClearRegs(32, true); + bool DefFP = determineFPRegsToClear(*MBBI, ClearRegs); + + // If the instruction does not write to a FP register and no elements were + // removed from the set, then no FP registers were used to pass + // arguments/returns. + if (!DefFP && ClearRegs.count() == ClearRegs.size()) { + // save space on stack for VLSTM + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); + + // Lazy store all FP registers to the stack + MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1, + ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7}) + VLSTM.addReg(R, RegState::Implicit | + (LiveRegs.contains(R) ? 0 : RegState::Undef)); + } else { + // Push all the callee-saved registers (s16-s31). 
+ MachineInstrBuilder VPUSH = + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTMSDB_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg) + VPUSH.addReg(Reg); + + // Clear FP registers with a VSCCLRM. + (void)CMSEClearFPRegsV81(MBB, MBBI, ClearRegs); + + // Save floating-point context. + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTR_FPCXTS_pre), ARM::SP) + .addReg(ARM::SP) + .addImm(-8) + .add(predOps(ARMCC::AL)); + } +} + +// Restore FP registers if present +void ARMExpandPseudo::CMSERestoreFPRegs( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs) { + if (STI->hasV8_1MMainlineOps()) + CMSERestoreFPRegsV81(MBB, MBBI, DL, AvailableRegs); + else + CMSERestoreFPRegsV8(MBB, MBBI, DL, AvailableRegs); +} + +void ARMExpandPseudo::CMSERestoreFPRegsV8( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs) { + if (!STI->hasFPRegs()) + return; + + // Use AvailableRegs to store the fp regs + std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs; + std::vector<unsigned> NonclearedFPRegs; + for (const MachineOperand &Op : MBBI->operands()) { + if (Op.isReg() && Op.isDef()) { + unsigned Reg = Op.getReg(); + assert(!ARM::DPRRegClass.contains(Reg) || + ARM::DPR_VFP2RegClass.contains(Reg)); + assert(!ARM::QPRRegClass.contains(Reg)); + if (ARM::DPR_VFP2RegClass.contains(Reg)) { + if (AvailableRegs.size() >= 2) { + unsigned SaveReg2 = AvailableRegs.pop_back_val(); + unsigned SaveReg1 = AvailableRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD)) + .addReg(SaveReg1, RegState::Define) + .addReg(SaveReg2, RegState::Define) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } else if (ARM::SPRRegClass.contains(Reg)) { + if (AvailableRegs.size() >= 1) { + unsigned SaveReg = AvailableRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg, 0); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } + } + } + + // Push FP regs that cannot be restored via normal registers on the stack + for (unsigned Reg : NonclearedFPRegs) { + if (ARM::DPR_VFP2RegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRD), Reg) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2) + .add(predOps(ARMCC::AL)); + else if (ARM::SPRRegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRS), Reg) + .addReg(ARM::SP) + .addImm(Reg - ARM::S0) + .add(predOps(ARMCC::AL)); + } + + // Lazy load fp regs from stack + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + + // Restore all FP registers via normal registers + for (const auto &Regs : ClearedFPRegs) { + unsigned Reg, SaveReg1, SaveReg2; + std::tie(Reg, SaveReg1, SaveReg2) = Regs; + if (ARM::DPR_VFP2RegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg) + .addReg(SaveReg1) + .addReg(SaveReg2) + .add(predOps(ARMCC::AL)); + else if (ARM::SPRRegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(SaveReg1) + .add(predOps(ARMCC::AL)); + } + + // Pop the stack space + BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + 
.add(predOps(ARMCC::AL)); +} + +static bool definesOrUsesFPReg(const MachineInstr &MI) { + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + unsigned Reg = Op.getReg(); + if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) || + (Reg >= ARM::D0 && Reg <= ARM::D15) || + (Reg >= ARM::S0 && Reg <= ARM::S31)) + return true; + } + return false; +} + +void ARMExpandPseudo::CMSERestoreFPRegsV81( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs) { + if (!definesOrUsesFPReg(*MBBI)) { + // Load FP registers from stack. + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + + // Pop the stack space + BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); + } else { + // Restore the floating point context. + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::VLDR_FPCXTS_post), + ARM::SP) + .addReg(ARM::SP) + .addImm(8) + .add(predOps(ARMCC::AL)); + + // Pop all the callee-saved registers (s16-s31). + MachineInstrBuilder VPOP = + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDMSIA_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg) + VPOP.addReg(Reg, RegState::Define); + } +} + /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as /// possible. This only gets used at -O0 so we don't care about efficiency of /// the generated code. @@ -1149,6 +1763,93 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, return true; } +static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, int JumpReg, + const LivePhysRegs &LiveRegs, bool Thumb1Only) { + const DebugLoc &DL = MBBI->getDebugLoc(); + if (Thumb1Only) { // push Lo and Hi regs separately + MachineInstrBuilder PushMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) { + PushMIB.addReg( + Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef); + } + + // Thumb1 can only tPUSH low regs, so we copy the high regs to the low + // regs that we just saved and push the low regs again, taking care to + // not clobber JumpReg. If JumpReg is one of the low registers, push first + // the values of r9-r11, and then r8. That would leave them ordered in + // memory, and allow us to later pop them with a single instructions. + // FIXME: Could also use any of r0-r3 that are free (including in the + // first PUSH above). + for (int LoReg = ARM::R7, HiReg = ARM::R11; LoReg >= ARM::R4; --LoReg) { + if (JumpReg == LoReg) + continue; + BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg) + .addReg(HiReg, LiveRegs.contains(HiReg) ? 0 : RegState::Undef) + .add(predOps(ARMCC::AL)); + --HiReg; + } + MachineInstrBuilder PushMIB2 = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) { + if (Reg == JumpReg) + continue; + PushMIB2.addReg(Reg, RegState::Kill); + } + + // If we couldn't use a low register for temporary storage (because it was + // the JumpReg), use r4 or r5, whichever is not JumpReg. It has already been + // saved. + if (JumpReg >= ARM::R4 && JumpReg <= ARM::R7) { + int LoReg = JumpReg == ARM::R4 ? ARM::R5 : ARM::R4; + BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg) + .addReg(ARM::R8, LiveRegs.contains(ARM::R8) ? 
0 : RegState::Undef) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)) + .add(predOps(ARMCC::AL)) + .addReg(LoReg, RegState::Kill); + } + } else { // push Lo and Hi registers with a single instruction + MachineInstrBuilder PushMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STMDB_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) { + PushMIB.addReg( + Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef); + } + } +} + +static void CMSEPopCalleeSaves(const TargetInstrInfo &TII, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, int JumpReg, + bool Thumb1Only) { + const DebugLoc &DL = MBBI->getDebugLoc(); + if (Thumb1Only) { + MachineInstrBuilder PopMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); + for (int R = 0; R < 4; ++R) { + PopMIB.addReg(ARM::R4 + R, RegState::Define); + BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), ARM::R8 + R) + .addReg(ARM::R4 + R, RegState::Kill) + .add(predOps(ARMCC::AL)); + } + MachineInstrBuilder PopMIB2 = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); + for (int R = 0; R < 4; ++R) + PopMIB2.addReg(ARM::R4 + R, RegState::Define); + } else { // pop Lo and Hi registers with a single instruction + MachineInstrBuilder PopMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::t2LDMIA_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) + PopMIB.addReg(Reg, RegState::Define); + } +} bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -1207,12 +1908,117 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Update call site info and delete the pseudo instruction TCRETURN. - MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI); + if (MI.isCandidateForCallSiteEntry()) + MI.getMF()->moveCallSiteInfo(&MI, &*NewMI); MBB.erase(MBBI); MBBI = NewMI; return true; } + case ARM::tBXNS_RET: { + MachineBasicBlock &AfterBB = CMSEClearFPRegs(MBB, MBBI); + + if (STI->hasV8_1MMainlineOps()) { + // Restore the non-secure floating point context. + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(ARM::VLDR_FPCXTNS_post), ARM::SP) + .addReg(ARM::SP) + .addImm(4) + .add(predOps(ARMCC::AL)); + } + + // Clear all GPR that are not a use of the return instruction. + assert(llvm::all_of(MBBI->operands(), [](const MachineOperand &Op) { + return !Op.isReg() || Op.getReg() != ARM::R12; + })); + SmallVector<unsigned, 5> ClearRegs; + determineGPRegsToClear( + *MBBI, {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R12}, ClearRegs); + CMSEClearGPRegs(AfterBB, AfterBB.end(), MBBI->getDebugLoc(), ClearRegs, + ARM::LR); + + MachineInstrBuilder NewMI = + BuildMI(AfterBB, AfterBB.end(), MBBI->getDebugLoc(), + TII->get(ARM::tBXNS)) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + for (const MachineOperand &Op : MI.operands()) + NewMI->addOperand(Op); + MI.eraseFromParent(); + return true; + } + case ARM::tBLXNS_CALL: { + DebugLoc DL = MBBI->getDebugLoc(); + unsigned JumpReg = MBBI->getOperand(0).getReg(); + + // Figure out which registers are live at the point immediately before the + // call. When we indiscriminately push a set of registers, the live + // registers are added as ordinary use operands, whereas dead registers + // are "undef". 
+ LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(MBB); + for (const MachineInstr &MI : make_range(MBB.rbegin(), MBBI.getReverse())) + LiveRegs.stepBackward(MI); + LiveRegs.stepBackward(*MBBI); + + CMSEPushCalleeSaves(*TII, MBB, MBBI, JumpReg, LiveRegs, + AFI->isThumb1OnlyFunction()); + + SmallVector<unsigned, 16> ClearRegs; + determineGPRegsToClear(*MBBI, + {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, + ARM::R5, ARM::R6, ARM::R7, ARM::R8, ARM::R9, + ARM::R10, ARM::R11, ARM::R12}, + ClearRegs); + auto OriginalClearRegs = ClearRegs; + + // Get the first cleared register as a scratch (to use later with tBIC). + // We need to use the first so we can ensure it is a low register. + unsigned ScratchReg = ClearRegs.front(); + + // Clear LSB of JumpReg + if (AFI->isThumb2Function()) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), JumpReg) + .addReg(JumpReg) + .addImm(1) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } else { + // We need to use an extra register to cope with 8M Baseline, + // since we have saved all of the registers we are ok to trash a non + // argument register here. + BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVi8), ScratchReg) + .add(condCodeOp()) + .addImm(1) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII->get(ARM::tBIC), JumpReg) + .addReg(ARM::CPSR, RegState::Define) + .addReg(JumpReg) + .addReg(ScratchReg) + .add(predOps(ARMCC::AL)); + } + + CMSESaveClearFPRegs(MBB, MBBI, DL, LiveRegs, + ClearRegs); // save+clear FP regs with ClearRegs + CMSEClearGPRegs(MBB, MBBI, DL, ClearRegs, JumpReg); + + const MachineInstrBuilder NewCall = + BuildMI(MBB, MBBI, DL, TII->get(ARM::tBLXNSr)) + .add(predOps(ARMCC::AL)) + .addReg(JumpReg, RegState::Kill); + + for (int I = 1, E = MI.getNumOperands(); I != E; ++I) + NewCall->addOperand(MI.getOperand(I)); + if (MI.isCandidateForCallSiteEntry()) + MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr()); + + CMSERestoreFPRegs(MBB, MBBI, DL, OriginalClearRegs); // restore FP registers + + CMSEPopCalleeSaves(*TII, MBB, MBBI, JumpReg, AFI->isThumb1OnlyFunction()); + + MI.eraseFromParent(); + return true; + } case ARM::VMOVHcc: case ARM::VMOVScc: case ARM::VMOVDcc: { @@ -1359,17 +2165,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // If there's dynamic realignment, adjust for it. if (RI.needsStackRealignment(MF)) { MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned MaxAlign = MFI.getMaxAlignment(); + Align MaxAlign = MFI.getMaxAlign(); assert (!AFI->isThumb1OnlyFunction()); // Emit bic r6, r6, MaxAlign - assert(MaxAlign <= 256 && "The BIC instruction cannot encode " - "immediates larger than 256 with all lower " - "bits set."); + assert(MaxAlign <= Align(256) && + "The BIC instruction cannot encode " + "immediates larger than 256 with all lower " + "bits set."); unsigned bicOpc = AFI->isThumbFunction() ? 
ARM::t2BICri : ARM::BICri; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6) .addReg(ARM::R6, RegState::Kill) - .addImm(MaxAlign - 1) + .addImm(MaxAlign.value() - 1) .add(predOps(ARMCC::AL)) .add(condCodeOp()); } @@ -1410,17 +2217,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, const bool Thumb = Opcode == ARM::tTPsoft; MachineInstrBuilder MIB; + MachineFunction *MF = MBB.getParent(); if (STI->genLongCalls()) { - MachineFunction *MF = MBB.getParent(); MachineConstantPool *MCP = MF->getConstantPool(); unsigned PCLabelID = AFI->createPICLabelUId(); MachineConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(MF->getFunction().getContext(), "__aeabi_read_tp", PCLabelID, 0); Register Reg = MI.getOperand(0).getReg(); - MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) - .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); + MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) + .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4))); if (!Thumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); @@ -1440,7 +2248,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); - MI.getMF()->moveCallSiteInfo(&MI, &*MIB); + // Update the call site info. + if (MI.isCandidateForCallSiteEntry()) + MF->moveCallSiteInfo(&MI, &*MIB); MI.eraseFromParent(); return true; } @@ -1504,7 +2314,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg) - .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); + .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4))); if (IsARM) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); @@ -1952,6 +2762,24 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case ARM::LOADDUAL: + case ARM::STOREDUAL: { + Register PairReg = MI.getOperand(0).getReg(); + + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Opcode == ARM::LOADDUAL ? ARM::LDRD : ARM::STRD)) + .addReg(TRI->getSubReg(PairReg, ARM::gsub_0), + Opcode == ARM::LOADDUAL ? RegState::Define : 0) + .addReg(TRI->getSubReg(PairReg, ARM::gsub_1), + Opcode == ARM::LOADDUAL ? 
RegState::Define : 0); + for (unsigned i = 1; i < MI.getNumOperands(); i++) + MIB.add(MI.getOperand(i)); + MIB.add(predOps(ARMCC::AL)); + MIB.cloneMemRefs(MI); + MI.eraseFromParent(); + return true; + } } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp index 6e19db3c7e22..4bfca8a803ca 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -48,7 +48,6 @@ #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" @@ -209,7 +208,7 @@ class ARMFastISel final : public FastISel { unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg); unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg); unsigned ARMSelectCallOp(bool UseReg); - unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT); + unsigned ARMLowerPICELF(const GlobalValue *GV, MVT VT); const TargetLowering *getTargetLowering() { return &TLI; } @@ -444,12 +443,8 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) { if (!Subtarget->hasVFP2Base()) return false; // MachineConstantPool wants an explicit alignment. - unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) { - // TODO: Figure out if this is correct. - Align = DL.getTypeAllocSize(CFP->getType()); - } - unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align); + Align Alignment = DL.getPrefTypeAlign(CFP->getType()); + unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment); unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS; @@ -508,12 +503,8 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { return 0; // MachineConstantPool wants an explicit alignment. - unsigned Align = DL.getPrefTypeAlignment(C->getType()); - if (Align == 0) { - // TODO: Figure out if this is correct. - Align = DL.getTypeAllocSize(C->getType()); - } - unsigned Idx = MCP.getConstantPoolIndex(C, Align); + Align Alignment = DL.getPrefTypeAlign(C->getType()); + unsigned Idx = MCP.getConstantPoolIndex(C, Alignment); ResultReg = createResultReg(TLI.getRegClassFor(VT)); if (isThumb2) AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -570,14 +561,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF)); } else { // MachineConstantPool wants an explicit alignment. - unsigned Align = DL.getPrefTypeAlignment(GV->getType()); - if (Align == 0) { - // TODO: Figure out if this is correct. - Align = DL.getTypeAllocSize(GV->getType()); - } + Align Alignment = DL.getPrefTypeAlign(GV->getType()); if (Subtarget->isTargetELF() && IsPositionIndependent) - return ARMLowerPICELF(GV, Align, VT); + return ARMLowerPICELF(GV, VT); // Grab index. unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; @@ -585,7 +572,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id, ARMCP::CPValue, PCAdj); - unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + unsigned Idx = MCP.getConstantPoolIndex(CPV, Alignment); // Load value. 
MachineInstrBuilder MIB; @@ -882,7 +869,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, int Offset = Addr.Offset; MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI); @@ -2090,6 +2077,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs, bool ARMFastISel::SelectRet(const Instruction *I) { const ReturnInst *Ret = cast<ReturnInst>(I); const Function &F = *I->getParent()->getParent(); + const bool IsCmseNSEntry = F.hasFnAttribute("cmse_nonsecure_entry"); if (!FuncInfo.CanLowerReturn) return false; @@ -2166,8 +2154,17 @@ bool ARMFastISel::SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } + unsigned RetOpc; + if (IsCmseNSEntry) + if (isThumb2) + RetOpc = ARM::tBXNS_RET; + else + llvm_unreachable("CMSE not valid for non-Thumb targets"); + else + RetOpc = Subtarget->getReturnOpcode(); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Subtarget->getReturnOpcode())); + TII.get(RetOpc)); AddOptionalDefs(MIB); for (unsigned R : RetRegs) MIB.addReg(R, RegState::Implicit); @@ -2239,7 +2236,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { if (!isTypeLegal(ArgTy, ArgVT)) return false; ISD::ArgFlagsTy Flags; - Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); + Flags.setOrigAlign(DL.getABITypeAlign(ArgTy)); Args.push_back(Op); ArgRegs.push_back(Arg); @@ -2293,7 +2290,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { bool ARMFastISel::SelectCall(const Instruction *I, const char *IntrMemName = nullptr) { const CallInst *CI = cast<CallInst>(I); - const Value *Callee = CI->getCalledValue(); + const Value *Callee = CI->getCalledOperand(); // Can't handle inline asm. if (isa<InlineAsm>(Callee)) return false; @@ -2302,12 +2299,11 @@ bool ARMFastISel::SelectCall(const Instruction *I, if (CI->isTailCall()) return false; // Check the calling convention. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); + CallingConv::ID CC = CI->getCallingConv(); // TODO: Avoid some calling conventions? - FunctionType *FTy = CS.getFunctionType(); + FunctionType *FTy = CI->getFunctionType(); bool isVarArg = FTy->isVarArg(); // Handle *simple* calls for now. @@ -2334,47 +2330,46 @@ bool ARMFastISel::SelectCall(const Instruction *I, SmallVector<Register, 8> ArgRegs; SmallVector<MVT, 8> ArgVTs; SmallVector<ISD::ArgFlagsTy, 8> ArgFlags; - unsigned arg_size = CS.arg_size(); + unsigned arg_size = CI->arg_size(); Args.reserve(arg_size); ArgRegs.reserve(arg_size); ArgVTs.reserve(arg_size); ArgFlags.reserve(arg_size); - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { + for (auto ArgI = CI->arg_begin(), ArgE = CI->arg_end(); ArgI != ArgE; ++ArgI) { // If we're lowering a memory intrinsic instead of a regular call, skip the // last argument, which shouldn't be passed to the underlying function. 
- if (IntrMemName && e - i <= 1) + if (IntrMemName && ArgE - ArgI <= 1) break; ISD::ArgFlagsTy Flags; - unsigned ArgIdx = i - CS.arg_begin(); - if (CS.paramHasAttr(ArgIdx, Attribute::SExt)) + unsigned ArgIdx = ArgI - CI->arg_begin(); + if (CI->paramHasAttr(ArgIdx, Attribute::SExt)) Flags.setSExt(); - if (CS.paramHasAttr(ArgIdx, Attribute::ZExt)) + if (CI->paramHasAttr(ArgIdx, Attribute::ZExt)) Flags.setZExt(); // FIXME: Only handle *easy* calls for now. - if (CS.paramHasAttr(ArgIdx, Attribute::InReg) || - CS.paramHasAttr(ArgIdx, Attribute::StructRet) || - CS.paramHasAttr(ArgIdx, Attribute::SwiftSelf) || - CS.paramHasAttr(ArgIdx, Attribute::SwiftError) || - CS.paramHasAttr(ArgIdx, Attribute::Nest) || - CS.paramHasAttr(ArgIdx, Attribute::ByVal)) + if (CI->paramHasAttr(ArgIdx, Attribute::InReg) || + CI->paramHasAttr(ArgIdx, Attribute::StructRet) || + CI->paramHasAttr(ArgIdx, Attribute::SwiftSelf) || + CI->paramHasAttr(ArgIdx, Attribute::SwiftError) || + CI->paramHasAttr(ArgIdx, Attribute::Nest) || + CI->paramHasAttr(ArgIdx, Attribute::ByVal)) return false; - Type *ArgTy = (*i)->getType(); + Type *ArgTy = (*ArgI)->getType(); MVT ArgVT; if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 && ArgVT != MVT::i1) return false; - Register Arg = getRegForValue(*i); + Register Arg = getRegForValue(*ArgI); if (!Arg.isValid()) return false; - Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy))); + Flags.setOrigAlign(DL.getABITypeAlign(ArgTy)); - Args.push_back(*i); + Args.push_back(*ArgI); ArgRegs.push_back(Arg); ArgVTs.push_back(ArgVT); ArgFlags.push_back(Flags); @@ -2949,8 +2944,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, return true; } -unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, - unsigned Align, MVT VT) { +unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) { bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); LLVMContext *Context = &MF->getFunction().getContext(); @@ -2961,12 +2955,12 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, /*AddCurrentAddress=*/UseGOT_PREL); - unsigned ConstAlign = - MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); + Align ConstAlign = + MF->getDataLayout().getPrefTypeAlign(Type::getInt32PtrTy(*Context)); unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand::MOLoad, 4, Align(4)); Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp index cb98b2b34efd..8a8f3237bb6f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -142,27 +142,6 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects(); } -static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII, - const MCPhysReg *CSRegs) { - // Integer spill area is handled with "pop". - if (isPopOpcode(MI.getOpcode())) { - // The first two operands are predicates. The last two are - // imp-def and imp-use of SP. Check everything in between. 
- for (int i = 5, e = MI.getNumOperands(); i != e; ++i) - if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs)) - return false; - return true; - } - if ((MI.getOpcode() == ARM::LDR_POST_IMM || - MI.getOpcode() == ARM::LDR_POST_REG || - MI.getOpcode() == ARM::t2LDR_POST) && - isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) && - MI.getOperand(1).getReg() == ARM::SP) - return true; - - return false; -} - static void emitRegPlusImmediate( bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg, @@ -256,9 +235,9 @@ struct StackAdjustingInsts { if (HasFP && !Info.BeforeFPSet) return; - CFAOffset -= Info.SPAdjust; + CFAOffset += Info.SPAdjust; unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); BuildMI(MBB, std::next(Info.I), dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) @@ -281,13 +260,13 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const unsigned Reg, - const unsigned Alignment, + const Align Alignment, const bool MustBeSingleInstruction) { const ARMSubtarget &AST = static_cast<const ARMSubtarget &>(MF.getSubtarget()); const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops(); - const unsigned AlignMask = Alignment - 1; - const unsigned NrBitsToZero = countTrailingZeros(Alignment); + const unsigned AlignMask = Alignment.value() - 1U; + const unsigned NrBitsToZero = Log2(Alignment); assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported"); if (!AFI->isThumbFunction()) { // if the BFC instruction is available, use that to zero the lower @@ -343,14 +322,15 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, /// Unfortunately we cannot determine this value in determineCalleeSaves() yet /// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use /// this to produce a conservative estimate that we check in an assert() later. -static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) { +static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI) { // For Thumb1, push.w isn't available, so the first push will always push // r7 and lr onto the stack first. if (AFI.isThumb1OnlyFunction()) return -AFI.getArgRegsSaveSize() - (2 * 4); // This is a conservative estimation: Assume the frame pointer being r7 and // pc("r15") up to r8 getting spilled before (= 8 registers). - return -AFI.getArgRegsSaveSize() - (8 * 4); + int FPCXTSaveSize = (STI.hasV8_1MMainlineOps() && AFI.isCmseNSEntryFunction()) ? 4 : 0; + return - FPCXTSaveSize - AFI.getArgRegsSaveSize() - (8 * 4); } void ARMFrameLowering::emitPrologue(MachineFunction &MF, @@ -367,10 +347,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, assert(!AFI->isThumb1OnlyFunction() && "This emitPrologue does not support Thumb1!"); bool isARM = !AFI->isThumbFunction(); - unsigned Align = STI.getFrameLowering()->getStackAlignment(); + Align Alignment = STI.getFrameLowering()->getStackAlign(); unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI.getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + int FPCXTSaveSize = 0; // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. 
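The deletion of isCSRestore above pairs with the emitEpilogue change just below: instead of re-deriving which instructions restore callee-saved registers by matching pop opcodes against the CSR list, the rewind loop now trusts the MachineInstr::FrameDestroy flag set when those restores are emitted (the same flag is threaded through emitSPUpdate and the epilogue BuildMI calls in the following hunks). A runnable stand-in for the rewind idiom, with a deliberately minimal hypothetical Instr type in place of the real MachineBasicBlock iterator machinery:

    #include <cassert>
    #include <vector>

    struct Instr {
      bool FrameDestroy; // stands in for MI.getFlag(MachineInstr::FrameDestroy)
    };

    // Walk backwards from the block end to the first tagged restore,
    // mirroring the "do { --MBBI; } while (...)" loop in emitEpilogue.
    static size_t firstRestore(const std::vector<Instr> &Block, size_t I) {
      if (I == 0)
        return 0;
      do {
        --I;
      } while (I != 0 && Block[I].FrameDestroy);
      if (!Block[I].FrameDestroy)
        ++I;
      return I;
    }

    int main() {
      // body, body, pop (restore), pop (restore)
      std::vector<Instr> Block = {{false}, {false}, {true}, {true}};
      assert(firstRestore(Block, Block.size()) == 2);
    }

The design point is that the code emitting a restore records its purpose once, rather than a later pass guessing it back from opcodes, which is what the retired isCSRestore had to do.
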
@@ -439,6 +420,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, FramePtrSpillFI = FI; GPRCS1Size += 4; break; + case ARM::FPCXTNS: + FPCXTSaveSize = 4; + break; default: // This is a DPR. Exclude the aligned DPRCS2 spills. if (Reg == ARM::D8) @@ -448,25 +432,35 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } } - // Move past area 1. + // Move past FPCXT area. MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push; + if (FPCXTSaveSize > 0) { + LastPush = MBBI++; + DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true); + } + + // Move past area 1. if (GPRCS1Size > 0) { GPRCS1Push = LastPush = MBBI++; DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true); } // Determine starting offsets of spill areas. - unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size; + unsigned FPCXTOffset = NumBytes - ArgRegsSaveSize - FPCXTSaveSize; + unsigned GPRCS1Offset = FPCXTOffset - GPRCS1Size; unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size; - unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U; - unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign; + Align DPRAlign = DPRCSSize ? std::min(Align(8), Alignment) : Align(4); + unsigned DPRGapSize = + (GPRCS1Size + GPRCS2Size + FPCXTSaveSize + ArgRegsSaveSize) % + DPRAlign.value(); + unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; int FramePtrOffsetInPush = 0; if (HasFP) { int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); - assert(getMaxFPOffset(MF.getFunction(), *AFI) <= FPOffset && + assert(getMaxFPOffset(STI, *AFI) <= FPOffset && "Max FP estimation is wrong"); - FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize; + FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize; AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + NumBytes); } @@ -599,9 +593,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, PushSize + FramePtrOffsetInPush, MachineInstr::FrameSetup); if (FramePtrOffsetInPush + PushSize != 0) { - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( nullptr, MRI->getDwarfRegNum(FramePtr, true), - -(ArgRegsSaveSize - FramePtrOffsetInPush))); + FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush)); BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -707,6 +701,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() - AFI->getFramePtrSpillOffset()); + AFI->setFPCXTSaveAreaSize(FPCXTSaveSize); AFI->setGPRCalleeSavedArea1Size(GPRCS1Size); AFI->setGPRCalleeSavedArea2Size(GPRCS2Size); AFI->setDPRCalleeSavedGapSize(DPRGapSize); @@ -717,7 +712,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // If aligned NEON registers were spilled, the stack has already been // realigned. 
if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) { - unsigned MaxAlign = MFI.getMaxAlignment(); + Align MaxAlign = MFI.getMaxAlign(); assert(!AFI->isThumb1OnlyFunction()); if (!AFI->isThumbFunction()) { emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign, @@ -793,20 +788,22 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize, + MachineInstr::FrameDestroy); } else { // Unwind MBBI to point to first LDR / VLDRD. - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); if (MBBI != MBB.begin()) { do { --MBBI; - } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs)); - if (!isCSRestore(*MBBI, TII, CSRegs)) + } while (MBBI != MBB.begin() && + MBBI->getFlag(MachineInstr::FrameDestroy)); + if (!MBBI->getFlag(MachineInstr::FrameDestroy)) ++MBBI; } // Move SP to start of FP callee save spill area. NumBytes -= (ArgRegsSaveSize + + AFI->getFPCXTSaveAreaSize() + AFI->getGPRCalleeSavedArea1Size() + AFI->getGPRCalleeSavedArea2Size() + AFI->getDPRCalleeSavedGapSize() + @@ -819,7 +816,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes) { if (isARM) emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes, - ARMCC::AL, 0, TII); + ARMCC::AL, 0, TII, + MachineInstr::FrameDestroy); else { // It's not possible to restore SP from FP in a single instruction. // For iOS, this looks like: @@ -831,10 +829,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, assert(!MFI.getPristineRegs(MF).test(ARM::R4) && "No scratch register to restore SP from FP!"); emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes, - ARMCC::AL, 0, TII); + ARMCC::AL, 0, TII, MachineInstr::FrameDestroy); BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(ARM::R4) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); } } else { // Thumb2 or ARM. @@ -842,15 +841,18 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP) .addReg(FramePtr) .add(predOps(ARMCC::AL)) - .add(condCodeOp()); + .add(condCodeOp()) + .setMIFlag(MachineInstr::FrameDestroy); else BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP) .addReg(FramePtr) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlag(MachineInstr::FrameDestroy); } } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes)) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes, + MachineInstr::FrameDestroy); // Increment past our save areas. 
if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) { @@ -863,31 +865,32 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getDPRCalleeSavedGapSize()) { assert(AFI->getDPRCalleeSavedGapSize() == 4 && "unexpected DPR alignment gap"); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize()); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize(), + MachineInstr::FrameDestroy); } if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; + if (AFI->getFPCXTSaveAreaSize()) MBBI++; } if (ArgRegsSaveSize) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize, + MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for /// debug info. It's the same as what we use for resolving the code-gen /// references for now. FIXME: This can go wrong when references are /// SP-relative and simple call frames aren't used. -int -ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { +int ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + Register &FrameReg) const { return ResolveFrameIndexReference(MF, FI, FrameReg, 0); } -int -ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - int SPAdj) const { +int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, + int FI, Register &FrameReg, + int SPAdj) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); @@ -969,10 +972,9 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, unsigned StmOpc, unsigned StrOpc, - bool NoGap, - bool(*Func)(unsigned, bool), + bool NoGap, bool (*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); @@ -1047,10 +1049,10 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, + MutableArrayRef<CalleeSavedInfo> CSI, unsigned LdmOpc, unsigned LdrOpc, bool isVarArg, bool NoGap, - bool(*Func)(unsigned, bool), + bool (*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); @@ -1060,6 +1062,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, bool isTailCall = false; bool isInterrupt = false; bool isTrap = false; + bool isCmseEntry = false; if (MBB.end() != MI) { DL = MI->getDebugLoc(); unsigned RetOpcode = MI->getOpcode(); @@ -1069,6 +1072,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, isTrap = RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || RetOpcode == ARM::tTRAP; + isCmseEntry = (RetOpcode == ARM::tBXNS || RetOpcode == ARM::tBXNS_RET); } SmallVector<unsigned, 4> Regs; @@ -1086,7 +1090,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && - !isTrap && STI.hasV5TOps()) { + !isCmseEntry && !isTrap && STI.hasV5TOps()) { if (MBB.succ_empty()) { Reg = ARM::PC; // 
Fold the return instruction into the LDM. @@ -1119,7 +1123,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Regs.size() > 1 || LdrOpc == 0) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP) .addReg(ARM::SP) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameDestroy); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i], getDefRegState(true)); if (DeleteRet) { @@ -1137,7 +1142,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0]) .addReg(ARM::SP, RegState::Define) - .addReg(ARM::SP); + .addReg(ARM::SP) + .setMIFlags(MachineInstr::FrameDestroy); // ARM mode needs an extra reg0 here due to addrmode2. Will go away once // that refactoring is complete (eventually). if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) { @@ -1162,7 +1168,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -1180,7 +1186,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, int FI = CSI[i].getFrameIdx(); // The even-numbered registers will be 16-byte aligned, the odd-numbered // registers will be 8-byte aligned. - MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16); + MFI.setObjectAlignment(FI, DNum % 2 ? Align(8) : Align(16)); // The stack slot for D8 needs to be maximally aligned because this is // actually the point where we align the stack pointer. MachineFrameInfo @@ -1189,7 +1195,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // over-alignment is not realized because the code inserted below adjusts // the stack pointer by numregs * 8 before aligning the stack pointer. if (DNum == 0) - MFI.setObjectAlignment(FI, MFI.getMaxAlignment()); + MFI.setObjectAlignment(FI, MFI.getMaxAlign()); } // Move the stack pointer to the d8 spill slot, and align it at the same @@ -1212,7 +1218,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, .add(predOps(ARMCC::AL)) .add(condCodeOp()); - unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment(); + Align MaxAlign = MF.getFrameInfo().getMaxAlign(); // We must set parameter MustBeSingleInstruction to true, since // skipAlignedDPRCS2Spills expects exactly 3 instructions to perform // stack alignment. 
Luckily, this can always be done since all ARM @@ -1335,7 +1341,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -1422,10 +1428,9 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, std::prev(MI)->addRegisterKilled(ARM::R4, TRI); } -bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { +bool ARMFrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -1437,6 +1442,16 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, ARM::t2STR_PRE : ARM::STR_PRE_IMM; unsigned FltOpc = ARM::VSTMDDB_UPD; unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); + // Save the non-secure floating point context. + if (llvm::any_of(CSI, [](const CalleeSavedInfo &C) { + return C.getReg() == ARM::FPCXTNS; + })) { + BuildMI(MBB, MI, DebugLoc(), STI.getInstrInfo()->get(ARM::VSTR_FPCXTNS_pre), + ARM::SP) + .addReg(ARM::SP) + .addImm(-4) + .add(predOps(ARMCC::AL)); + } emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0, MachineInstr::FrameSetup); emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0, @@ -1453,10 +1468,9 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } -bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { +bool ARMFrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -1601,7 +1615,7 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) { return; // Don't bother if the default stack alignment is sufficiently high. - if (MF.getSubtarget().getFrameLowering()->getStackAlignment() >= 8) + if (MF.getSubtarget().getFrameLowering()->getStackAlign() >= Align(8)) return; // Aligned spills require stack realignment. @@ -1630,6 +1644,16 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) { SavedRegs.set(ARM::R4); } +bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // For CMSE entry functions, we want to save the FPCXT_NS immediately + // upon function entry (resp. restore it immediately before return). + if (STI.hasV8_1MMainlineOps() && + MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction()) + return false; + + return true; +} + void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { @@ -1699,6 +1723,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (RegInfo->hasBasePointer(MF)) SavedRegs.set(RegInfo->getBaseRegister()); + // On v8.1-M.Main CMSE entry functions save/restore FPCXT.
+ if (STI.hasV8_1MMainlineOps() && AFI->isCmseNSEntryFunction()) + CanEliminateFrame = false; + // Don't spill FP if the frame can be eliminated. This is determined // by scanning the callee-save registers to see if any is modified. const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); @@ -1771,8 +1799,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (!LRSpilled && AFI->isThumb1OnlyFunction()) { unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII); // Force LR to be spilled if the Thumb function size is > 2048. This enables - // use of BL to implement far jump. If it turns out that it's not needed - // then the branch fix up path will undo it. + // use of BL to implement far jump. if (FnSize >= (1 << 11)) { CanEliminateFrame = false; ForceLRSpill = true; @@ -1858,7 +1885,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // // We could do slightly better on Thumb1; in some cases, an sp-relative // offset would be legal even though an fp-relative offset is not. - int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI); + int MaxFPOffset = getMaxFPOffset(STI, *AFI); bool HasLargeArgumentList = HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit; @@ -2045,8 +2072,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // of GPRs, spill one extra callee save GPR so we won't have to pad between // the integer and double callee save areas. LLVM_DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n"); - unsigned TargetAlign = getStackAlignment(); - if (TargetAlign >= 8 && (NumGPRSpills & 1)) { + const Align TargetAlign = getStackAlign(); + if (TargetAlign >= Align(8) && (NumGPRSpills & 1)) { if (CS1Spilled && !UnspilledCS1GPRs.empty()) { for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { unsigned Reg = UnspilledCS1GPRs[i]; @@ -2083,7 +2110,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (BigFrameOffsets && !ExtraCSSpill) { // If any non-reserved CS register isn't spilled, just spill one or two // extra. That should take care of it! - unsigned NumExtras = TargetAlign / 4; + unsigned NumExtras = TargetAlign.value() / 4; SmallVector<unsigned, 2> Extras; while (NumExtras && !UnspilledCS1GPRs.empty()) { unsigned Reg = UnspilledCS1GPRs.back(); @@ -2117,16 +2144,15 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n"); const TargetRegisterClass &RC = ARM::GPRRegClass; unsigned Size = TRI->getSpillSize(RC); - unsigned Align = TRI->getSpillAlignment(RC); - RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false)); + Align Alignment = TRI->getSpillAlign(RC); + RS->addScavengingFrameIndex( + MFI.CreateStackObject(Size, Alignment, false)); } } } - if (ForceLRSpill) { + if (ForceLRSpill) SavedRegs.set(ARM::LR); - AFI->setLRIsSpilledForFarJump(true); - } AFI->setLRIsSpilled(SavedRegs.test(ARM::LR)); } @@ -2142,6 +2168,27 @@ void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF, SavedRegs.set(ARM::R0); } +bool ARMFrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const { + // For CMSE entry functions, handle floating-point context as if it was a + // callee-saved register. 
+ if (STI.hasV8_1MMainlineOps() && + MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction()) { + CSI.emplace_back(ARM::FPCXTNS); + CSI.back().setRestored(false); + } + + return false; +} + +const TargetFrameLowering::SpillSlot * +ARMFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { + static const SpillSlot FixedSpillOffsets[] = {{ARM::FPCXTNS, -4}}; + NumEntries = array_lengthof(FixedSpillOffsets); + return FixedSpillOffsets; +} + MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { @@ -2364,8 +2411,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Emit the relevant DWARF information about the change in stack pointer as // well as where to find both r4 and r5 (the callee-save registers) - CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8)); + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 8)); BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( @@ -2409,7 +2455,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create( MF.getFunction().getContext(), "__STACK_LIMIT", PCLabelId, 0); MachineConstantPool *MCP = MF.getConstantPool(); - unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4); + unsigned CPI = MCP->getConstantPoolIndex(NewCPV, Align(4)); // ldr SR0, [pc, offset(STACK_LIMIT)] BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0) @@ -2507,8 +2553,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( // Emit the DWARF info about the change in stack as well as where to find the // previous link register - CFIIndex = - MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12)); + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 12)); BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( @@ -2570,7 +2615,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( } // Update the CFA offset now that we've popped - CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0)); + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); @@ -2594,7 +2639,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( } // Update the CFA offset now that we've popped - CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0)); + CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h index 0462b01af707..4c2c07d64f57 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -9,9 +9,7 @@ #ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H #define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H -#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/TargetFrameLowering.h" -#include <vector> namespace llvm { @@ -33,13 +31,14 @@ public: bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, const 
TargetRegisterInfo *TRI) const override; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const override; + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; bool keepFramePointer(const MachineFunction &MF) const override; @@ -49,9 +48,9 @@ public: bool hasReservedCallFrame(const MachineFunction &MF) const override; bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; + Register &FrameReg) const override; int ResolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, int SPAdj) const; + Register &FrameReg, int SPAdj) const; void getCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const override; @@ -62,25 +61,31 @@ public: MachineBasicBlock &MBB) const override; /// Returns true if the target will correctly handle shrink wrapping. - bool enableShrinkWrapping(const MachineFunction &MF) const override { - return true; - } + bool enableShrinkWrapping(const MachineFunction &MF) const override; + bool isProfitableForNoCSROpt(const Function &F) const override { // The no-CSR optimisation is bad for code size on ARM, because we can save // many registers with a single PUSH/POP pair. return false; } + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector<CalleeSavedInfo> &CSI) const override; + + const SpillSlot * + getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, - unsigned StrOpc, bool NoGap, - bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs, - unsigned MIFlags = 0) const; + ArrayRef<CalleeSavedInfo> CSI, unsigned StmOpc, + unsigned StrOpc, bool NoGap, bool (*Func)(unsigned, bool), + unsigned NumAlignedDPRCS2Regs, unsigned MIFlags = 0) const; void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc, + MutableArrayRef<CalleeSavedInfo> CSI, unsigned LdmOpc, unsigned LdrOpc, bool isVarArg, bool NoGap, - bool(*Func)(unsigned, bool), + bool (*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs) const; MachineBasicBlock::iterator diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 9b06987178d8..2a9a31dab74f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -145,6 +145,8 @@ public: // Thumb 2 Addressing Modes: bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); + template <unsigned Shift> + bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, @@ -237,6 +239,10 @@ private: void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, uint16_t OpcodeWithNoCarry, bool Add, bool Predicated); + /// SelectMVE_VSHLC - Select MVE intrinsics for a shift that carries between + /// vector lanes. 
+ void SelectMVE_VSHLC(SDNode *N, bool Predicated); + /// Select long MVE vector reductions with two vector operands /// Stride is the number of vector element widths the instruction can operate /// on: @@ -264,7 +270,21 @@ private: /// pointer points to a set of NumVecs sub-opcodes used for the /// different stages (e.g. VLD20 versus VLD21) of each load family. void SelectMVE_VLD(SDNode *N, unsigned NumVecs, - const uint16_t *const *Opcodes); + const uint16_t *const *Opcodes, bool HasWriteback); + + /// SelectMVE_VxDUP - Select MVE incrementing-dup instructions. Opcodes is an + /// array of 3 elements for the 8, 16 and 32-bit lane sizes. + void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, + bool Wrapping, bool Predicated); + + /// SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D, + /// CX1DA, CX2D, CX2DA, CX3D, CX3DA). + /// \arg \c NumExtraOps number of extra operands besides the coprocessor, + /// the accumulator and the immediate operand, i.e. 0 + /// for CX1*, 1 for CX2*, 2 for CX3* + /// \arg \c HasAccum whether the instruction has an accumulator operand + void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps, + bool HasAccum); /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used @@ -1171,8 +1191,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, // Only multiples of 4 are allowed for the offset, so the frame object // alignment must be at least 4. MachineFrameInfo &MFI = MF->getFrameInfo(); - if (MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); + if (MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); @@ -1195,9 +1215,9 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, if (RHSC * 4 < MFI.getObjectSize(FI)) { // For LHS+RHS to result in an offset that's a multiple of 4 the object // indexed by the LHS must be 4-byte aligned. - if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); - if (MFI.getObjectAlignment(FI) >= 4) { + if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); + if (MFI.getObjectAlign(FI) >= Align(4)) { Base = CurDAG->getTargetFrameIndex( FI, TLI->getPointerTy(CurDAG->getDataLayout())); OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); @@ -1294,6 +1314,33 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, return true; } +template <unsigned Shift> +bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, + SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || CurDAG->isBaseWithConstantOffset(N)) { + int RHSC; + if (isScaledConstantInRange(N.getOperand(1), 1 << Shift, -255, 256, RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + OffImm = + CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32); + return true; + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R - imm8 operands.
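The templated SelectT2AddrModeImm8&lt;Shift&gt; added just above accepts an offset only when it is a multiple of (1 &lt;&lt; Shift) whose scaled value fits an 8-bit range. A standalone sketch of that acceptance test in plain C++; the helper mirrors what isScaledConstantInRange checks, but the names here are illustrative only:

#include <cstdio>

// Accept Offset iff it equals Encoded * (1 << Shift) with Encoded in
// [-255, 255]; the selected instruction rescales the 8-bit field itself.
template <unsigned Shift>
static bool selectScaledImm8(int Offset, int &Encoded) {
  const int Scale = 1 << Shift;
  if (Offset % Scale != 0)
    return false; // not Scale-aligned: fall back to "base only"
  const int Scaled = Offset / Scale;
  if (Scaled < -255 || Scaled > 255)
    return false; // magnitude does not fit the 8-bit field
  Encoded = Scaled;
  return true;
}

int main() {
  int Enc = 0;
  printf("%d\n", selectScaledImm8<2>(1020, Enc)); // 1 (1020 = 255 * 4)
  printf("%d\n", selectScaledImm8<2>(1022, Enc)); // 0 (not a multiple of 4)
  printf("%d\n", selectScaledImm8<2>(1024, Enc)); // 0 (256 is out of range)
}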
@@ -1679,7 +1726,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { EVT LoadedVT; unsigned Opcode = 0; bool isSExtLd, isPre; - unsigned Align; + Align Alignment; ARMVCC::VPTCodes Pred; SDValue PredReg; SDValue Chain, Base, Offset; @@ -1695,7 +1742,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { Chain = LD->getChain(); Base = LD->getBasePtr(); Offset = LD->getOffset(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); Pred = ARMVCC::None; @@ -1711,7 +1758,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { Chain = LD->getChain(); Base = LD->getBasePtr(); Offset = LD->getOffset(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); Pred = ARMVCC::Then; @@ -1725,7 +1772,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N); SDValue NewOffset; - if (Align >= 2 && LoadedVT == MVT::v4i16 && + if (Alignment >= Align(2) && LoadedVT == MVT::v4i16 && SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; @@ -1743,12 +1790,12 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; else Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; - } else if (Align >= 4 && + } else if (Alignment >= Align(4) && (CanChangeType || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2)) Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; - else if (Align >= 2 && + else if (Alignment >= Align(2) && (CanChangeType || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) @@ -1762,8 +1809,8 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { SDValue Ops[] = {Base, NewOffset, CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg, Chain}; - SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0), - MVT::i32, MVT::Other, Ops); + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, + N->getValueType(0), MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); ReplaceUses(SDValue(N, 1), SDValue(New, 0)); @@ -2009,6 +2056,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { + assert(Subtarget->hasNEON()); assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range"); SDLoc dl(N); @@ -2030,6 +2078,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; @@ -2037,6 +2086,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs, // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v8f16: + case MVT::v8bf16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -2148,6 +2198,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t 
*QOpcodes1) { + assert(Subtarget->hasNEON()); assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range"); SDLoc dl(N); @@ -2172,6 +2223,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; @@ -2179,6 +2231,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs, // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; case MVT::v8f16: + case MVT::v8bf16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -2299,6 +2352,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, const uint16_t *QOpcodes) { + assert(Subtarget->hasNEON()); assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range"); SDLoc dl(N); @@ -2339,11 +2393,13 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; case MVT::v4f16: + case MVT::v4bf16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; // Quad-register operations: case MVT::v8f16: + case MVT::v8bf16: case MVT::v8i16: OpcodeIndex = 0; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 1; break; @@ -2482,7 +2538,16 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes, Ops.push_back(N->getOperand(0)); // chain - CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); + SmallVector<EVT, 8> VTs; + VTs.push_back(N->getValueType(1)); + VTs.push_back(N->getValueType(0)); + VTs.push_back(N->getValueType(2)); + + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), VTs, Ops); + ReplaceUses(SDValue(N, 0), SDValue(New, 1)); + ReplaceUses(SDValue(N, 1), SDValue(New, 0)); + ReplaceUses(SDValue(N, 2), SDValue(New, 2)); + CurDAG->RemoveDeadNode(N); } void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode, @@ -2552,6 +2617,25 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry, CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } +void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) { + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + + // One vector input, followed by a 32-bit word of bits to shift in + // and then an immediate shift count + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(2)); + int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(); + Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(4)); + else + AddEmptyMVEPredicateToOps(Ops, Loc); + + CurDAG->SelectNodeTo(N, ARM::MVE_VSHLC, N->getVTList(), makeArrayRef(Ops)); +} + static bool SDValueToConstBool(SDValue SDVal) { assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant"); ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal); @@ -2644,7 +2728,8 @@ void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated, } void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, - const uint16_t *const *Opcodes) { + const uint16_t *const *Opcodes, + bool HasWriteback) { EVT VT = N->getValueType(0); SDLoc Loc(N); @@ -2664,23 +2749,141 @@ void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs, } EVT 
DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2); - EVT ResultTys[] = {DataTy, MVT::Other}; + SmallVector<EVT, 4> ResultTys = {DataTy, MVT::Other}; + unsigned PtrOperand = HasWriteback ? 1 : 2; auto Data = SDValue( CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0); SDValue Chain = N->getOperand(0); - for (unsigned Stage = 0; Stage < NumVecs; ++Stage) { - SDValue Ops[] = {Data, N->getOperand(2), Chain}; + // Add an MVE_VLDn instruction for each Vec, except the last + for (unsigned Stage = 0; Stage < NumVecs - 1; ++Stage) { + SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain}; auto LoadInst = CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops); Data = SDValue(LoadInst, 0); Chain = SDValue(LoadInst, 1); } + // The last may need a writeback on it + if (HasWriteback) + ResultTys = {DataTy, MVT::i32, MVT::Other}; + SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain}; + auto LoadInst = + CurDAG->getMachineNode(OurOpcodes[NumVecs - 1], Loc, ResultTys, Ops); - for (unsigned i = 0; i < NumVecs; i++) + unsigned i; + for (i = 0; i < NumVecs; i++) ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data)); - ReplaceUses(SDValue(N, NumVecs), Chain); + CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, + SDValue(LoadInst, 0))); + if (HasWriteback) + ReplaceUses(SDValue(N, i++), SDValue(LoadInst, 1)); + ReplaceUses(SDValue(N, i), SDValue(LoadInst, HasWriteback ? 2 : 1)); + CurDAG->RemoveDeadNode(N); +} + +void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, + bool Wrapping, bool Predicated) { + EVT VT = N->getValueType(0); + SDLoc Loc(N); + + uint16_t Opcode; + switch (VT.getScalarSizeInBits()) { + case 8: + Opcode = Opcodes[0]; + break; + case 16: + Opcode = Opcodes[1]; + break; + case 32: + Opcode = Opcodes[2]; + break; + default: + llvm_unreachable("bad vector element size in SelectMVE_VxDUP"); + } + + SmallVector<SDValue, 8> Ops; + unsigned OpIdx = 1; + + SDValue Inactive; + if (Predicated) + Inactive = N->getOperand(OpIdx++); + + Ops.push_back(N->getOperand(OpIdx++)); // base + if (Wrapping) + Ops.push_back(N->getOperand(OpIdx++)); // limit + + SDValue ImmOp = N->getOperand(OpIdx++); // step + int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue(); + Ops.push_back(getI32Imm(ImmValue, Loc)); + + if (Predicated) + AddMVEPredicateToOps(Ops, Loc, N->getOperand(OpIdx), Inactive); + else + AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0)); + + CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); +} + +void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode, + size_t NumExtraOps, bool HasAccum) { + bool IsBigEndian = CurDAG->getDataLayout().isBigEndian(); + SDLoc Loc(N); + SmallVector<SDValue, 8> Ops; + + unsigned OpIdx = 1; + + // Convert and append the immediate operand designating the coprocessor. + SDValue ImmCoproc = N->getOperand(OpIdx++); + uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCoproc)->getZExtValue(); + Ops.push_back(getI32Imm(ImmCoprocVal, Loc)); + + // For accumulating variants copy the low and high order parts of the + // accumulator into a register pair and add it to the operand vector. + if (HasAccum) { + SDValue AccLo = N->getOperand(OpIdx++); + SDValue AccHi = N->getOperand(OpIdx++); + if (IsBigEndian) + std::swap(AccLo, AccHi); + Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0)); + } + + // Copy extra operands as-is.
+ for (size_t I = 0; I < NumExtraOps; I++) + Ops.push_back(N->getOperand(OpIdx++)); + + // Convert and append the immediate operand. + SDValue Imm = N->getOperand(OpIdx); + uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue(); + Ops.push_back(getI32Imm(ImmVal, Loc)); + + // Accumulating variants are IT-predicable, add predicate operands. + if (HasAccum) { + SDValue Pred = getAL(CurDAG, Loc); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + Ops.push_back(Pred); + Ops.push_back(PredReg); + } + + // Create the CDE instruction. + SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops); + SDValue ResultPair = SDValue(InstrNode, 0); + + // The original intrinsic had two outputs, and the output of the dual-register + // CDE instruction is a register pair. We need to extract the two subregisters + // and replace all uses of the original outputs with the extracted + // subregisters. + uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1}; + if (IsBigEndian) + std::swap(SubRegs[0], SubRegs[1]); + + for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) { + if (SDValue(N, ResIdx).use_empty()) + continue; + SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc, + MVT::i32, ResultPair); + ReplaceUses(SDValue(N, ResIdx), SubReg); + } + CurDAG->RemoveDeadNode(N); } @@ -2689,6 +2892,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, const uint16_t *DOpcodes, const uint16_t *QOpcodes0, const uint16_t *QOpcodes1) { + assert(Subtarget->hasNEON()); assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range"); SDLoc dl(N); @@ -2725,6 +2929,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, case MVT::v8i16: case MVT::v4f16: case MVT::v8f16: + case MVT::v4bf16: + case MVT::v8bf16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: @@ -3202,7 +3408,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { MachineFunction& MF = CurDAG->getMachineFunction(); MachineMemOperand *MemOp = MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand::MOLoad, 4, Align(4)); CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp}); @@ -3222,8 +3428,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Set the alignment of the frame object to 4, to avoid having to generate // more than one ADD MachineFrameInfo &MFI = MF->getFrameInfo(); - if (MFI.getObjectAlignment(FI) < 4) - MFI.setObjectAlignment(FI, 4); + if (MFI.getObjectAlign(FI) < Align(4)) + MFI.setObjectAlignment(FI, Align(4)); CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI, CurDAG->getTargetConstant(0, dl, MVT::i32)); return; @@ -3486,6 +3692,59 @@ void ARMDAGToDAGISel::Select(SDNode *N) { CurDAG->RemoveDeadNode(N); return; } + case ARMISD::LDRD: { + if (Subtarget->isThumb2()) + break; // TableGen handles isel in this case. + SDValue Base, RegOffset, ImmOffset; + const SDValue &Chain = N->getOperand(0); + const SDValue &Addr = N->getOperand(1); + SelectAddrMode3(Addr, Base, RegOffset, ImmOffset); + if (RegOffset != CurDAG->getRegister(0, MVT::i32)) { + // The register-offset variant of LDRD mandates that the register + // allocated to RegOffset is not reused in any of the remaining operands. + // This restriction is currently not enforced. Therefore emitting this + // variant is explicitly avoided.
+ Base = Addr; + RegOffset = CurDAG->getRegister(0, MVT::i32); + } + SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain}; + SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl, + {MVT::Untyped, MVT::Other}, Ops); + SDValue Lo = CurDAG->getTargetExtractSubreg(ARM::gsub_0, dl, MVT::i32, + SDValue(New, 0)); + SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32, + SDValue(New, 0)); + transferMemOperands(N, New); + ReplaceUses(SDValue(N, 0), Lo); + ReplaceUses(SDValue(N, 1), Hi); + ReplaceUses(SDValue(N, 2), SDValue(New, 1)); + CurDAG->RemoveDeadNode(N); + return; + } + case ARMISD::STRD: { + if (Subtarget->isThumb2()) + break; // TableGen handles isel in this case. + SDValue Base, RegOffset, ImmOffset; + const SDValue &Chain = N->getOperand(0); + const SDValue &Addr = N->getOperand(3); + SelectAddrMode3(Addr, Base, RegOffset, ImmOffset); + if (RegOffset != CurDAG->getRegister(0, MVT::i32)) { + // The register-offset variant of STRD mandates that the register + // allocated to RegOffset is not reused in any of the remaining operands. + // This restriction is currently not enforced. Therefore emitting this + // variant is explicitly avoided. + Base = Addr; + RegOffset = CurDAG->getRegister(0, MVT::i32); + } + SDNode *RegPair = + createGPRPairNode(MVT::Untyped, N->getOperand(1), N->getOperand(2)); + SDValue Ops[] = {SDValue(RegPair, 0), Base, RegOffset, ImmOffset, Chain}; + SDNode *New = CurDAG->getMachineNode(ARM::STOREDUAL, dl, MVT::Other, Ops); + transferMemOperands(N, New); + ReplaceUses(SDValue(N, 0), SDValue(New, 0)); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::LOOP_DEC: { SDValue Ops[] = { N->getOperand(1), N->getOperand(2), @@ -3828,14 +4087,24 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD2_UPD: { - static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed, - ARM::VLD2d16wb_fixed, - ARM::VLD2d32wb_fixed, - ARM::VLD1q64wb_fixed}; - static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed, - ARM::VLD2q16PseudoWB_fixed, - ARM::VLD2q32PseudoWB_fixed }; - SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VLD2d8wb_fixed, ARM::VLD2d16wb_fixed, ARM::VLD2d32wb_fixed, + ARM::VLD1q64wb_fixed}; + static const uint16_t QOpcodes[] = {ARM::VLD2q8PseudoWB_fixed, + ARM::VLD2q16PseudoWB_fixed, + ARM::VLD2q32PseudoWB_fixed}; + SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr); + } else { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8, + ARM::MVE_VLD21_8_wb}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16, + ARM::MVE_VLD21_16_wb}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, + ARM::MVE_VLD21_32_wb}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 2, Opcodes, true); + } return; } @@ -3855,17 +4124,30 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VLD4_UPD: { - static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, - ARM::VLD4d16Pseudo_UPD, - ARM::VLD4d32Pseudo_UPD, - ARM::VLD1d64QPseudoWB_fixed}; - static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD, - ARM::VLD4q16Pseudo_UPD, - ARM::VLD4q32Pseudo_UPD }; - static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD, - ARM::VLD4q16oddPseudo_UPD, - ARM::VLD4q32oddPseudo_UPD }; - SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD, + ARM::VLD1d64QPseudoWB_fixed}; + static const 
uint16_t QOpcodes0[] = {ARM::VLD4q8Pseudo_UPD, + ARM::VLD4q16Pseudo_UPD, + ARM::VLD4q32Pseudo_UPD}; + static const uint16_t QOpcodes1[] = {ARM::VLD4q8oddPseudo_UPD, + ARM::VLD4q16oddPseudo_UPD, + ARM::VLD4q32oddPseudo_UPD}; + SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + } else { + static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8, + ARM::MVE_VLD42_8, + ARM::MVE_VLD43_8_wb}; + static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16, + ARM::MVE_VLD42_16, + ARM::MVE_VLD43_16_wb}; + static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32, + ARM::MVE_VLD42_32, + ARM::MVE_VLD43_32_wb}; + static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; + SelectMVE_VLD(N, 4, Opcodes, true); + } return; } @@ -3913,15 +4195,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VST2_UPD: { - static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed, - ARM::VST2d16wb_fixed, - ARM::VST2d32wb_fixed, - ARM::VST1q64wb_fixed}; - static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed, - ARM::VST2q16PseudoWB_fixed, - ARM::VST2q32PseudoWB_fixed }; - SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); - return; + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VST2d8wb_fixed, ARM::VST2d16wb_fixed, ARM::VST2d32wb_fixed, + ARM::VST1q64wb_fixed}; + static const uint16_t QOpcodes[] = {ARM::VST2q8PseudoWB_fixed, + ARM::VST2q16PseudoWB_fixed, + ARM::VST2q32PseudoWB_fixed}; + SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr); + return; + } + break; } case ARMISD::VST3_UPD: { @@ -3940,18 +4224,20 @@ void ARMDAGToDAGISel::Select(SDNode *N) { } case ARMISD::VST4_UPD: { - static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD, - ARM::VST4d16Pseudo_UPD, - ARM::VST4d32Pseudo_UPD, - ARM::VST1d64QPseudoWB_fixed}; - static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, - ARM::VST4q16Pseudo_UPD, - ARM::VST4q32Pseudo_UPD }; - static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, - ARM::VST4q16oddPseudo_UPD, - ARM::VST4q32oddPseudo_UPD }; - SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); - return; + if (Subtarget->hasNEON()) { + static const uint16_t DOpcodes[] = { + ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, ARM::VST4d32Pseudo_UPD, + ARM::VST1d64QPseudoWB_fixed}; + static const uint16_t QOpcodes0[] = {ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD}; + static const uint16_t QOpcodes1[] = {ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD}; + SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1); + return; + } + break; } case ARMISD::VST2LN_UPD: { @@ -4430,7 +4716,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32, ARM::MVE_VLD21_32}; static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; - SelectMVE_VLD(N, 2, Opcodes); + SelectMVE_VLD(N, 2, Opcodes, false); return; } @@ -4444,7 +4730,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { ARM::MVE_VLD42_32, ARM::MVE_VLD43_32}; static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32}; - SelectMVE_VLD(N, 4, Opcodes); + SelectMVE_VLD(N, 4, Opcodes, false); return; } } @@ -4457,6 +4743,29 @@ void ARMDAGToDAGISel::Select(SDNode *N) { default: break; + // Scalar f32 -> bf16 + case Intrinsic::arm_neon_vcvtbfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + llvm::EVT DestTy = N->getValueType(0); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + 
SDValue Ops[] = { Src, Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops); + return; + } + + // Vector v4f32 -> v4bf16 + case Intrinsic::arm_neon_vcvtfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4bf16, Ops); + return; + } + case Intrinsic::arm_mve_urshrl: SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false); return; @@ -4475,18 +4784,21 @@ void ARMDAGToDAGISel::Select(SDNode *N) { case Intrinsic::arm_mve_sqrshrl: SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true); return; - case Intrinsic::arm_mve_lsll: - SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false); - return; - case Intrinsic::arm_mve_asrl: - SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false); - return; case Intrinsic::arm_mve_vadc: case Intrinsic::arm_mve_vadc_predicated: SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true, IntNo == Intrinsic::arm_mve_vadc_predicated); return; + case Intrinsic::arm_mve_vsbc: + case Intrinsic::arm_mve_vsbc_predicated: + SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true, + IntNo == Intrinsic::arm_mve_vsbc_predicated); + return; + case Intrinsic::arm_mve_vshlc: + case Intrinsic::arm_mve_vshlc_predicated: + SelectMVE_VSHLC(N, IntNo == Intrinsic::arm_mve_vshlc_predicated); + return; case Intrinsic::arm_mve_vmlldava: case Intrinsic::arm_mve_vmlldava_predicated: { @@ -4524,6 +4836,80 @@ void ARMDAGToDAGISel::Select(SDNode *N) { OpcodesS, OpcodesU); return; } + + case Intrinsic::arm_mve_vidup: + case Intrinsic::arm_mve_vidup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VIDUPu8, ARM::MVE_VIDUPu16, ARM::MVE_VIDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, false, + IntNo == Intrinsic::arm_mve_vidup_predicated); + return; + } + + case Intrinsic::arm_mve_vddup: + case Intrinsic::arm_mve_vddup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VDDUPu8, ARM::MVE_VDDUPu16, ARM::MVE_VDDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, false, + IntNo == Intrinsic::arm_mve_vddup_predicated); + return; + } + + case Intrinsic::arm_mve_viwdup: + case Intrinsic::arm_mve_viwdup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VIWDUPu8, ARM::MVE_VIWDUPu16, ARM::MVE_VIWDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, true, + IntNo == Intrinsic::arm_mve_viwdup_predicated); + return; + } + + case Intrinsic::arm_mve_vdwdup: + case Intrinsic::arm_mve_vdwdup_predicated: { + static const uint16_t Opcodes[] = { + ARM::MVE_VDWDUPu8, ARM::MVE_VDWDUPu16, ARM::MVE_VDWDUPu32, + }; + SelectMVE_VxDUP(N, Opcodes, true, + IntNo == Intrinsic::arm_mve_vdwdup_predicated); + return; + } + + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: { + bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da || + IntNo == Intrinsic::arm_cde_cx2da || + IntNo == Intrinsic::arm_cde_cx3da; + size_t NumExtraOps; + uint16_t Opcode; + switch (IntNo) { + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + NumExtraOps = 0; + Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D; + break; + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + NumExtraOps = 1; + Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D; + break; + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: + NumExtraOps = 2; + Opcode = HasAccum ? 
ARM::CDE_CX3DA : ARM::CDE_CX3D; + break; + default: + llvm_unreachable("Unexpected opcode"); + } + SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum); + return; + } } break; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9f504b1eaa42..287e2e60e572 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -210,6 +210,8 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) @@ -284,6 +286,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); // Vector reductions @@ -292,6 +296,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); + setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); if (!HasMVEFP) { setOperationAction(ISD::SINT_TO_FP, VT, Expand); @@ -341,6 +349,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::FMINNUM, VT, Legal); setOperationAction(ISD::FMAXNUM, VT, Legal); setOperationAction(ISD::FROUND, VT, Legal); + setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); // No native support for these. setOperationAction(ISD::FDIV, VT, Expand); @@ -358,6 +370,17 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } } + // Custom Expand smaller than legal vector reductions to prevent false zero + // items being added. + setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); + setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); + setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); + // We 'support' these types up to bitcast/load/store level, regardless of // MVE integer-only / float support. Only doing FP data processing on the FP // vector types is inhibited at integer-only level. 
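The "false zero items" that the reduction comment above guards against are easy to see outside the compiler: if a v4f16 reduction were simply widened to the legal v8f16 with zero-filled lanes, any reduction whose identity is not zero (fmul, fmin, fmax) would fold the padding into the result. A plain C++ illustration with arbitrary values:

#include <cstdio>

// Multiply-reduce N lanes; 1.0 is the identity for fmul.
static float reduceMul(const float *V, int N) {
  float Acc = 1.0f;
  for (int I = 0; I < N; ++I)
    Acc *= V[I];
  return Acc;
}

int main() {
  float Narrow[4] = {2, 3, 4, 5};
  float Widened[8] = {2, 3, 4, 5, 0, 0, 0, 0}; // naive zero padding
  printf("%g\n", reduceMul(Narrow, 4));  // 120, the intended result
  printf("%g\n", reduceMul(Widened, 8)); // 0: the padded lanes poison it
}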
@@ -717,13 +740,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasFullFP16()) { addRegisterClass(MVT::f16, &ARM::HPRRegClass); setOperationAction(ISD::BITCAST, MVT::i16, Custom); - setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } + if (Subtarget->hasBF16()) { + addRegisterClass(MVT::bf16, &ARM::HPRRegClass); + setAllExpand(MVT::bf16); + if (!Subtarget->hasFullFP16()) + setOperationAction(ISD::BITCAST, MVT::bf16, Custom); + } + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -771,6 +800,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } + + if (Subtarget->hasBF16()) { + addQRTypeForNEON(MVT::v8bf16); + addDRTypeForNEON(MVT::v4bf16); + } } if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { @@ -912,9 +946,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::v4f32, Expand); } - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); @@ -938,10 +969,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::VECREDUCE_ADD); + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::BITCAST); + } + if (Subtarget->hasMVEIntegerOps()) { + setTargetDAGCombine(ISD::SMIN); + setTargetDAGCombine(ISD::UMIN); + setTargetDAGCombine(ISD::SMAX); + setTargetDAGCombine(ISD::UMAX); + setTargetDAGCombine(ISD::FP_EXTEND); } if (!Subtarget->hasFP64()) { @@ -1073,6 +1118,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); // MVE lowers 64 bit shifts to lsll and lsrl // assuming that ISD::SRL and SRA of i64 are already marked custom @@ -1419,12 +1466,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } if (Subtarget->hasNEON()) { - // vmin and vmax aren't available in a scalar form, so we use - // a NEON instruction with an undef lane instead. - setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); - setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); + // vmin and vmax aren't available in a scalar form, so we can use + // a NEON instruction with an undef lane instead. This has a performance + // penalty on some cores, so we don't do this unless we have been + // asked to by the core tuning model. 
+ if (Subtarget->useNEONForSinglePrecisionFP()) { + setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); + setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); + setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); + } setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); @@ -1452,6 +1503,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); + if (Subtarget->hasMVEIntegerOps()) + setTargetDAGCombine(ISD::VSELECT); + if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); if (Subtarget->isThumb1Only()) @@ -1550,10 +1604,12 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; + case ARMISD::tSECALL: return "ARMISD::tSECALL"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; + case ARMISD::SERET_FLAG: return "ARMISD::SERET_FLAG"; case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; @@ -1606,10 +1662,14 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; + case ARMISD::LDRD: return "ARMISD::LDRD"; + case ARMISD::STRD: return "ARMISD::STRD"; + case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST"; + case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST"; case ARMISD::VCMP: return "ARMISD::VCMP"; case ARMISD::VCMPZ: return "ARMISD::VCMPZ"; case ARMISD::VTST: return "ARMISD::VTST"; @@ -1650,8 +1710,28 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMOVN: return "ARMISD::VMOVN"; + case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs"; + case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu"; + case ARMISD::VCVTN: return "ARMISD::VCVTN"; + case ARMISD::VCVTL: return "ARMISD::VCVTL"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::VADDVs: return "ARMISD::VADDVs"; + case ARMISD::VADDVu: return "ARMISD::VADDVu"; + case ARMISD::VADDLVs: return "ARMISD::VADDLVs"; + case ARMISD::VADDLVu: return "ARMISD::VADDLVu"; + case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs"; + case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu"; + case ARMISD::VADDLVps: return "ARMISD::VADDLVps"; + case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu"; + case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps"; + case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu"; + case ARMISD::VMLAVs: return "ARMISD::VMLAVs"; + case ARMISD::VMLAVu: return "ARMISD::VMLAVu"; + case ARMISD::VMLALVs: return "ARMISD::VMLALVs"; + case ARMISD::VMLALVu: return "ARMISD::VMLALVu"; + case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs"; + case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; @@ -1955,6 +2035,35 @@ CCAssignFn 
*ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, } } +SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, + MVT LocVT, MVT ValVT, SDValue Val) const { + Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), + Val); + if (Subtarget->hasFullFP16()) { + Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); + } else { + Val = DAG.getNode(ISD::TRUNCATE, dl, + MVT::getIntegerVT(ValVT.getSizeInBits()), Val); + Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); + } + return Val; +} + +SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, + MVT LocVT, MVT ValVT, + SDValue Val) const { + if (Subtarget->hasFullFP16()) { + Val = DAG.getNode(ARMISD::VMOVrh, dl, + MVT::getIntegerVT(LocVT.getSizeInBits()), Val); + } else { + Val = DAG.getNode(ISD::BITCAST, dl, + MVT::getIntegerVT(ValVT.getSizeInBits()), Val); + Val = DAG.getNode(ISD::ZERO_EXTEND, dl, + MVT::getIntegerVT(LocVT.getSizeInBits()), Val); + } + return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); +} + /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult( @@ -1982,7 +2091,8 @@ SDValue ARMTargetLowering::LowerCallResult( } SDValue Val; - if (VA.needsCustom()) { + if (VA.needsCustom() && + (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); @@ -2031,6 +2141,13 @@ SDValue ARMTargetLowering::LowerCallResult( break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) + if (VA.needsCustom() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); + InVals.push_back(Val); } @@ -2097,22 +2214,34 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); MachineFunction::CallSiteInfo CSInfo; bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; + bool isCmseNSCall = false; bool PreferIndirect = false; + // Determine whether this is a non-secure function call. + if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call")) + isCmseNSCall = true; + // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall()) isTailCall = false; + // For both the non-secure calls and the returns from a CMSE entry function, + // the function needs to do some extra work after the call, or before the + // return, respectively, thus it cannot end with a tail call + if (isCmseNSCall || AFI->isCmseNSEntryFunction()) + isTailCall = false; + if (isa<GlobalAddressSDNode>(Callee)) { // If we're optimizing for minimum size and the function is called three or // more times in this block, we can improve codesize by calling indirectly // as BLXr has a 16-bit encoding.
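Bit-level model of the two helpers added above: an f16 (or bf16) value lives in the low 16 bits of its 32-bit ABI location. Without full FP16 support the value has to travel through integer operations, which is what the TRUNCATE/ZERO_EXTEND arms express. A scalar sketch of that soft path (illustrative only; the hard-FP16 path uses VMOVhr/VMOVrh register moves instead):

    #include <cstdint>

    // MoveFromHPR, soft path: f16 payload -> low 16 bits of the 32-bit
    // location, upper bits zeroed (the ISD::ZERO_EXTEND above).
    uint32_t move_from_hpr(uint16_t half_bits) {
      return static_cast<uint32_t>(half_bits);
    }

    // MoveToHPR, soft path: recover the 16-bit payload from the 32-bit
    // location (the ISD::TRUNCATE above).
    uint16_t move_to_hpr(uint32_t loc_bits) {
      return static_cast<uint16_t>(loc_bits);
    }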
auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - if (CLI.CS) { - auto *BB = CLI.CS.getParent(); + if (CLI.CB) { + auto *BB = CLI.CB->getParent(); PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && count_if(GV->users(), [&BB](const User *U) { return isa<Instruction>(U) && @@ -2126,7 +2255,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, PreferIndirect); - if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) + if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically @@ -2187,31 +2316,50 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; } - // f64 and v2f64 might be passed in i32 pairs and must be split into pieces - if (VA.needsCustom()) { - if (VA.getLocVT() == MVT::v2f64) { - SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(0, dl, MVT::i32)); - SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, - DAG.getConstant(1, dl, MVT::i32)); - - PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, - VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - - VA = ArgLocs[++i]; // skip ahead to next loc - if (VA.isRegLoc()) { - PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, - VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); - } else { - assert(VA.isMemLoc()); + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) + if (VA.needsCustom() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { + Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); + } else { + // f16 arguments could have been extended prior to argument lowering. + // Mask these arguments if this is a CMSE nonsecure call.
+ auto ArgVT = Outs[realArgIdx].ArgVT; + if (isCmseNSCall && (ArgVT == MVT::f16)) { + auto LocBits = VA.getLocVT().getSizeInBits(); + auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); + SDValue Mask = + DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); + Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + } + } - MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, - dl, DAG, VA, Flags)); - } - } else { - PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + // f64 and v2f64 might be passed in i32 pairs and must be split into pieces + if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { + SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, + DAG.getConstant(1, dl, MVT::i32)); + + PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); + + VA = ArgLocs[++i]; // skip ahead to next loc + if (VA.isRegLoc()) { + PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); + } else { + assert(VA.isMemLoc()); + + MemOpChains.push_back( + LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); } + } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { + PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], + StackPtr, MemOpChains, Flags); } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { @@ -2222,7 +2370,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isThisReturn = true; } const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), i); RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { @@ -2245,9 +2393,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); - SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, - MachinePointerInfo(), - DAG.InferPtrAlignment(AddArg)); + SDValue Load = + DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), + DAG.InferPtrAlign(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } @@ -2268,8 +2416,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); - SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, - MVT::i32); + SDValue AlignNode = + DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; @@ -2311,7 +2459,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { @@ -2327,7 +2474,7 @@ 
ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2341,7 +2488,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2393,7 +2540,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, @@ -2405,10 +2552,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } + if (isCmseNSCall) { + assert(!isARMFunc && !isDirect && + "Cannot handle call to ARM function or direct call"); + if (NumBytes > 0) { + DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), + "call to non-secure function would " + "require passing arguments on stack", + dl.getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + if (isStructRet) { + DiagnosticInfoUnsupported Diag( + DAG.getMachineFunction().getFunction(), + "call to non-secure function would return value through pointer", + dl.getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + } + // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) + if (isCmseNSCall) + CallOpc = ARMISD::tSECALL; + else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = ARMISD::CALL; @@ -2468,6 +2636,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); InFlag = Chain.getValue(1); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); @@ -2488,15 +2657,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, /// and then confiscate the rest of the parameter registers to insure /// this. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, - unsigned Align) const { + Align Alignment) const { // Byval (as with any stack) slots are always at least 4 byte aligned. - Align = std::max(Align, 4U); + Alignment = std::max(Alignment, Align(4)); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; - unsigned AlignInRegs = Align / 4; + unsigned AlignInRegs = Alignment.value() / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); @@ -2635,9 +2804,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Check that the call results are passed in the same way. 
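The CMSE handling in the hunks above is about not leaking secure state across the security boundary: arguments must fit in registers (hence the diagnostic when NumBytes > 0, since a non-secure callee cannot be handed data on the secure stack), and an f16 argument widened to a 32-bit location must have its unused upper bits scrubbed. What the mask in the f16 branch works out to, as a standalone sketch:

    #include <cstdint>

    // For an f16 in a 32-bit location: keep the 16 payload bits, clear the
    // rest so stale secure-state bits never reach non-secure code.
    uint32_t mask_f16_arg(uint32_t loc_bits) {
      const uint32_t mask = (1u << 16) - 1; // APInt::getLowBitsSet(32, 16)
      return loc_bits & mask;
    }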
LLVMContext &C = *DAG.getContext(); - if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, - CCAssignFnForReturn(CalleeCC, isVarArg), - CCAssignFnForReturn(CallerCC, isVarArg))) + if (!CCState::resultsCompatible( + getEffectiveCallingConv(CalleeCC, isVarArg), + getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, + CCAssignFnForReturn(CalleeCC, isVarArg), + CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) return false; // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -2678,7 +2849,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; - if (VA.needsCustom()) { + if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. The types will not match // the registers; give up on memory f64 refs until we figure @@ -2777,6 +2948,17 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); AFI->setReturnRegsCount(RVLocs.size()); + // Report error if cmse entry function returns structure through first ptr arg. + if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { + // Note: using an empty SDLoc(), as the first line of the function is a + // better place to report than the last line. + DiagnosticInfoUnsupported Diag( + DAG.getMachineFunction().getFunction(), + "secure entry function would return value through pointer", + SDLoc().getDebugLoc()); + DAG.getContext()->diagnose(Diag); + } + // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); @@ -2819,7 +3001,24 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, break; } - if (VA.needsCustom()) { + // Mask f16 arguments if this is a CMSE nonsecure entry. + auto RetVT = Outs[realRVLocIdx].ArgVT; + if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { + if (VA.needsCustom() && VA.getValVT() == MVT::f16) { + Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); + } else { + auto LocBits = VA.getLocVT().getSizeInBits(); + auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); + SDValue Mask = + DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); + Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); + Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); + } + } + + if (VA.needsCustom() && + (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, @@ -2827,15 +3026,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 0 : 1), - Flag); + Chain = + DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 
0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - HalfGPRs.getValue(isLittleEndian ? 1 : 0), - Flag); + Chain = + DAG.getCopyToReg(Chain, dl, VA.getLocReg(), + HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc @@ -2849,22 +3048,20 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 0 : 1), - Flag); + fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), - fmrrd.getValue(isLittleEndian ? 1 : 0), - Flag); + fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), - ReturnF16 ? MVT::f16 : VA.getLocVT())); + RetOps.push_back(DAG.getRegister( + VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = @@ -2898,7 +3095,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return LowerInterruptReturn(RetOps, dl, DAG); } - return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); + ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : + ARMISD::RET_FLAG; + return DAG.getNode(RetNode, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -3040,11 +3239,10 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, } if (CP->isMachineConstantPoolEntry()) - Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, - CP->getAlignment()); + Res = + DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, - CP->getAlignment()); + Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } @@ -3063,14 +3261,14 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); if (!IsPositionIndependent) { - CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); + CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); } else { unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad( @@ -3199,8 +3397,9 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, const auto *GA = cast<GlobalAddressSDNode>(Op); auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); SDValue Offset = DAG.getLoad( - PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, - DAG.getTargetConstantPool(CPV, PtrVT, 4)), + PtrVT, DL, Chain, + DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, + DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); @@ -3219,7 +3418,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); - SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); + SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), Argument, @@ -3270,7 +3469,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3288,7 +3487,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, @@ -3391,11 +3590,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, // that are strings for simplicity. 
auto *CDAInit = dyn_cast<ConstantDataArray>(Init); unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); - unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar); + Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar); unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); - if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || + if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize || Size == 0) return SDValue(); @@ -3434,8 +3633,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, } auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); - SDValue CPAddr = - DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); + SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { AFI->markGlobalAsPromotedToConstantPool(GVar); AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + @@ -3505,7 +3703,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { // use literal pool for address constant ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); RelAddr = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3525,7 +3723,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { - SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3636,7 +3834,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID( unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); SDValue ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); - std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue}; + constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; SDValue Callee = DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); SDValue RegisterMask = DAG.getRegisterMask(Mask); @@ -3720,7 +3918,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); - CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, @@ -3782,6 +3980,15 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_mve_pred_v2i: return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); + case Intrinsic::arm_mve_vreinterpretq: + return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::arm_mve_lsll: + return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::arm_mve_asrl: + return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -3982,6 +4189,42 @@ void 
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, AFI->setVarArgsFrameIndex(FrameIndex); } +bool ARMTargetLowering::splitValueIntoRegisterParts( + SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, + unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { + bool IsABIRegCopy = CC.hasValue(); + EVT ValueVT = Val.getValueType(); + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && + PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); + Parts[0] = Val; + return true; + } + return false; +} + +SDValue ARMTargetLowering::joinRegisterPartsIntoValue( + SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, + MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const { + bool IsABIRegCopy = CC.hasValue(); + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && + PartVT == MVT::f32) { + unsigned ValueBits = ValueVT.getSizeInBits(); + unsigned PartBits = PartVT.getSizeInBits(); + SDValue Val = Parts[0]; + + Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); + Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); + Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + return Val; + } + return SDValue(); +} + SDValue ARMTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, @@ -4054,44 +4297,41 @@ SDValue ARMTargetLowering::LowerFormalArguments( if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); - if (VA.needsCustom()) { + if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. 
- if (VA.getLocVT() == MVT::v2f64) { - SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); - VA = ArgLocs[++i]; // skip ahead to next loc - SDValue ArgValue2; - if (VA.isMemLoc()) { - int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); - SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), FI)); - } else { - ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], - Chain, DAG, dl); - } - ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue1, - DAG.getIntPtrConstant(0, dl)); - ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, - ArgValue, ArgValue2, - DAG.getIntPtrConstant(1, dl)); - } else - ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + SDValue ArgValue1 = + GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + VA = ArgLocs[++i]; // skip ahead to next loc + SDValue ArgValue2; + if (VA.isMemLoc()) { + int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); + } else { + ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); + } + ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, + ArgValue1, DAG.getIntPtrConstant(0, dl)); + ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, + ArgValue2, DAG.getIntPtrConstant(1, dl)); + } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { + ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; - - if (RegVT == MVT::f16) + if (RegVT == MVT::f16 || RegVT == MVT::bf16) RC = &ARM::HPRRegClass; else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; - else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) + else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || + RegVT == MVT::v4bf16) RC = &ARM::DPRRegClass; - else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) + else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || + RegVT == MVT::v8bf16) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass @@ -4131,6 +4371,13 @@ SDValue ARMTargetLowering::LowerFormalArguments( break; } + // f16 arguments have their size extended to 4 bytes and passed as if they + // had been copied to the LSBs of a 32-bit register. + // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) + if (VA.needsCustom() && + (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) + ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); + InVals.push_back(ArgValue); } else { // VA.isRegLoc() // sanity check @@ -5709,85 +5956,27 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. 
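For reference, the i64 <-> f64 expansion this comment is about moves values through a pair of 32-bit core registers (VMOVRRD one way, VMOVDRR the other). A host-side model of the split and rebuild, assuming little-endian word order:

    #include <cstdint>
    #include <cstring>

    // VMOVRRD: D register -> two core registers.
    void split_f64(double d, uint32_t &lo, uint32_t &hi) {
      uint64_t bits;
      std::memcpy(&bits, &d, sizeof bits);
      lo = static_cast<uint32_t>(bits);
      hi = static_cast<uint32_t>(bits >> 32);
    }

    // VMOVDRR: two core registers -> D register.
    double build_f64(uint32_t lo, uint32_t hi) {
      uint64_t bits = (static_cast<uint64_t>(hi) << 32) | lo;
      double d;
      std::memcpy(&d, &bits, sizeof bits);
      return d;
    }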
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *Subtarget) { +SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); - // This function is only supposed to be called for i64 types, either as the - // source or destination of the bit convert. + // This function is only supposed to be called for i16 and i64 types, either + // as the source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); - const bool HasFullFP16 = Subtarget->hasFullFP16(); - - if (SrcVT == MVT::f32 && DstVT == MVT::i32) { - // FullFP16: half values are passed in S-registers, and we don't - // need any of the bitcast and moves: - // - // t2: f32,ch = CopyFromReg t0, Register:f32 %0 - // t5: i32 = bitcast t2 - // t18: f16 = ARMISD::VMOVhr t5 - if (Op.getOpcode() != ISD::CopyFromReg || - Op.getValueType() != MVT::f32) - return SDValue(); - - auto Move = N->use_begin(); - if (Move->getOpcode() != ARMISD::VMOVhr) - return SDValue(); - - SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; - SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); - DAG.ReplaceAllUsesWith(*Move, &Copy); - return Copy; - } - - if (SrcVT == MVT::i16 && DstVT == MVT::f16) { - if (!HasFullFP16) - return SDValue(); - // SoftFP: read half-precision arguments: - // - // t2: i32,ch = ... - // t7: i16 = truncate t2 <~~~~ Op - // t8: f16 = bitcast t7 <~~~~ N - // - if (Op.getOperand(0).getValueType() == MVT::i32) - return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), - MVT::f16, Op.getOperand(0)); - - return SDValue(); - } - // Half-precision return values - if (SrcVT == MVT::f16 && DstVT == MVT::i16) { - if (!HasFullFP16) - return SDValue(); - // - // t11: f16 = fadd t8, t10 - // t12: i16 = bitcast t11 <~~~ SDNode N - // t13: i32 = zero_extend t12 - // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 - // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 - // - // transform this into: - // - // t20: i32 = ARMISD::VMOVrh t11 - // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 - // - auto ZeroExtend = N->use_begin(); - if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || - ZeroExtend->getValueType(0) != MVT::i32) - return SDValue(); + if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && + (DstVT == MVT::f16 || DstVT == MVT::bf16)) + return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), + DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); - auto Copy = ZeroExtend->use_begin(); - if (Copy->getOpcode() == ISD::CopyToReg && - Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { - SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); - DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); - return Cvt; - } - return SDValue(); - } + if ((DstVT == MVT::i16 || DstVT == MVT::i32) && + (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) + return DAG.getNode( + ISD::TRUNCATE, SDLoc(N), DstVT, + MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) return SDValue(); @@ -5930,16 +6119,20 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. 
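The FPSCR formula mentioned above is easy to sanity-check. FPSCR.RMode sits in bits [23:22] with the encoding 0 = nearest, 1 = towards +infinity, 2 = towards -infinity, 3 = towards zero, while FLT_ROUNDS numbers the same modes 1, 2, 3, 0; adding one at bit 22 rotates one encoding into the other:

    #include <cassert>
    #include <cstdint>

    uint32_t flt_rounds(uint32_t fpscr) {
      return ((fpscr + (1u << 22)) >> 22) & 3;
    }

    int main() {
      assert(flt_rounds(0u << 22) == 1); // nearest
      assert(flt_rounds(1u << 22) == 2); // towards +infinity
      assert(flt_rounds(2u << 22) == 3); // towards -infinity
      assert(flt_rounds(3u << 22) == 0); // towards zero
      return 0;
    }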
SDLoc dl(Op); - SDValue Ops[] = { DAG.getEntryNode(), - DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; + SDValue Chain = Op.getOperand(0); + SDValue Ops[] = {Chain, + DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; - SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); + SDValue FPSCR = + DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); + Chain = FPSCR.getValue(1); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); - return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, - DAG.getConstant(3, dl, MVT::i32)); + SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, + DAG.getConstant(3, dl, MVT::i32)); + return DAG.getMergeValues({And, Chain}, dl); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, @@ -6424,9 +6617,10 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { /// immediate" operand (e.g., VMOV). If so, return the encoded value. static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, - const SDLoc &dl, EVT &VT, bool is128Bits, + const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type) { unsigned OpCmode, Imm; + bool is128Bits = VectorVT.is128BitVector(); // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified @@ -6544,9 +6738,18 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, ImmMask <<= 1; } - if (DAG.getDataLayout().isBigEndian()) - // swap higher and lower 32 bit word - Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); + if (DAG.getDataLayout().isBigEndian()) { + // Reverse the order of elements within the vector. + unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; + unsigned Mask = (1 << BytesPerElem) - 1; + unsigned NumElems = 8 / BytesPerElem; + unsigned NewImm = 0; + for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { + unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); + NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; + } + Imm = NewImm; + } // Op=1, Cmode=1110. OpCmode = 0x1e; @@ -6585,8 +6788,6 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, case MVT::f64: { SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); - if (!ST->isLittle()) - std::swap(Lo, Hi); return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); } case MVT::f32: @@ -6639,7 +6840,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 
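The big-endian rewrite added to isVMOVModifiedImm above generalises the old 32-bit word swap: the 8-bit Imm of a 64-bit modified immediate carries one bit per byte, so on big-endian targets the bits must be reversed in groups matching the vector's element size. A standalone copy of that loop with two worked cases:

    #include <cassert>

    unsigned reverse_modimm(unsigned Imm, unsigned ElemBits) {
      unsigned BytesPerElem = ElemBits / 8;
      unsigned Mask = (1u << BytesPerElem) - 1;
      unsigned NumElems = 8 / BytesPerElem;
      unsigned NewImm = 0;
      for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
        unsigned Elem = (Imm >> (ElemNum * BytesPerElem)) & Mask;
        NewImm |= Elem << ((NumElems - ElemNum - 1) * BytesPerElem);
      }
      return NewImm;
    }

    int main() {
      assert(reverse_modimm(0x0F, 32) == 0xF0); // 32-bit elems: old word swap
      assert(reverse_modimm(0x03, 16) == 0xC0); // 16-bit elems: 2-bit groups
      return 0;
    }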
SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), - VMovVT, false, VMOVModImm); + VMovVT, VT, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, @@ -6656,7 +6857,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, // Finally, try a VMVN.i32 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, - false, VMVNModImm); + VT, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); @@ -7064,6 +7265,104 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) { return true; } +// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted +// from a pair of inputs. For example: +// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), +// FP_ROUND(EXTRACT_ELT(Y, 0), +// FP_ROUND(EXTRACT_ELT(X, 1), +// FP_ROUND(EXTRACT_ELT(Y, 1), ...) +static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, + const ARMSubtarget *ST) { + assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + if (!ST->hasMVEFloatOps()) + return SDValue(); + + SDLoc dl(BV); + EVT VT = BV.getValueType(); + if (VT != MVT::v8f16) + return SDValue(); + + // We are looking for a buildvector of fptrunc elements, where all the + // elements are extracted in an interleaved fashion from two sources. Check the first two + // items are valid enough and extract some info from them (they are checked + // properly in the loop below). + if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || + BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || + BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) + return SDValue(); + if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || + BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || + BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) + return SDValue(); + SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); + SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); + if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) + return SDValue(); + + // Check all the values in the BuildVector line up with our expectations. + for (unsigned i = 1; i < 4; i++) { + auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { + return Trunc.getOpcode() == ISD::FP_ROUND && + Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Trunc.getOperand(0).getOperand(0) == Op && + Trunc.getOperand(0).getConstantOperandVal(1) == Idx; + }; + if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) + return SDValue(); + if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) + return SDValue(); + } + + SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, + DAG.getConstant(0, dl, MVT::i32)); + return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, + DAG.getConstant(1, dl, MVT::i32)); +} + +// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted +// from a single input on alternating lanes. For example: +// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0), +// FP_EXTEND(EXTRACT_ELT(X, 2), +// FP_EXTEND(EXTRACT_ELT(X, 4), ...)
+static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, + const ARMSubtarget *ST) { + assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); + if (!ST->hasMVEFloatOps()) + return SDValue(); + + SDLoc dl(BV); + EVT VT = BV.getValueType(); + if (VT != MVT::v4f32) + return SDValue(); + + // We are looking for a buildvector of fpext elements, where all the + // elements are alternating lanes from a single source. For example <0,2,4,6> + // or <1,3,5,7>. Check the first two items are valid enough and extract some + // info from them (they are checked properly in the loop below). + if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || + BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); + int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); + if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) + return SDValue(); + + // Check all the values in the BuildVector line up with our expectations. + for (unsigned i = 1; i < 4; i++) { + auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { + return Trunc.getOpcode() == ISD::FP_EXTEND && + Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + Trunc.getOperand(0).getOperand(0) == Op && + Trunc.getOperand(0).getConstantOperandVal(1) == Idx; + }; + if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) + return SDValue(); + } + + return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, + DAG.getConstant(Offset, dl, MVT::i32)); +} + // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. @@ -7163,13 +7462,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getUNDEF(VT); if ((ST->hasNEON() && SplatBitSize <= 64) || - (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { + (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { // Check if an immediate VMOV works. EVT VmovVT; - SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VmovVT, VT.is128BitVector(), - VMOVModImm); + SDValue Val = + isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); @@ -7179,9 +7477,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isVMOVModifiedImm( - NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VmovVT, VT.is128BitVector(), - ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); + NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, + VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -7321,12 +7618,19 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (isConstant) return SDValue(); - // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and + // vmovn). Empirical tests suggest this is rarely worth it for vectors of + // length <= 2.
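Spelled out, the lane shapes the two VCVT rewrites above are matching (a scalar model with float standing in for both element widths; the names are illustrative):

    #include <array>

    using V4 = std::array<float, 4>;
    using V8 = std::array<float, 8>;

    // VCVTN (from LowerBuildVectorOfFPTrunc): narrowed lanes of X and Y
    // interleave into the bottom/top halves of each lane pair.
    V8 vcvtn_model(const V4 &X, const V4 &Y) {
      V8 out{};
      for (int i = 0; i < 4; ++i) {
        out[2 * i + 0] = X[i]; // fptrunc(X[i])
        out[2 * i + 1] = Y[i]; // fptrunc(Y[i])
      }
      return out;
    }

    // VCVTL (from LowerBuildVectorOfFPExt): widen every other lane of S,
    // starting at lane Offset (0 or 1).
    V4 vcvtl_model(const V8 &S, int Offset) {
      V4 out{};
      for (int i = 0; i < 4; ++i)
        out[i] = S[2 * i + Offset]; // fpext(S[2*i + Offset])
      return out;
    }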
+ if (NumElts >= 4) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; - } + + // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into + // VCVT's + if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) + return VCVT; + if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) + return VCVT; if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector @@ -7527,7 +7831,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); - Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } @@ -7579,7 +7883,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[1], Mask, DAG); if (!Shuffle) return SDValue(); - return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); } enum ShuffleOpCodes { @@ -8892,7 +9196,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { if (ShouldUseSRet) { // Create stack object for sret. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + const Align StackAlign = DL.getPrefTypeAlign(RetTy); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); @@ -9067,8 +9371,7 @@ void ARMTargetLowering::ExpandDIV_Windows( DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); - Results.push_back(Lower); - Results.push_back(Upper); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); } static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { @@ -9101,6 +9404,25 @@ static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); } +void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + LoadSDNode *LD = cast<LoadSDNode>(N); + EVT MemVT = LD->getMemoryVT(); + assert(LD->isUnindexed() && "Loads should be unindexed at this point."); + + if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && + !Subtarget->isThumb1Only() && LD->isVolatile()) { + SDLoc dl(N); + SDValue Result = DAG.getMemIntrinsicNode( + ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), + {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); + SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); + SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 
1 : 0); + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + Results.append({Pair, Result.getValue(2)}); + } +} + static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); @@ -9130,6 +9452,38 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { ST->getMemOperand()); } +static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *Subtarget) { + StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); + EVT MemVT = ST->getMemoryVT(); + assert(ST->isUnindexed() && "Stores should be unindexed at this point."); + + if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && + !Subtarget->isThumb1Only() && ST->isVolatile()) { + SDNode *N = Op.getNode(); + SDLoc dl(N); + + SDValue Lo = DAG.getNode( + ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, + MVT::i32)); + SDValue Hi = DAG.getNode( + ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), + DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl, + MVT::i32)); + + return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), + {ST->getChain(), Lo, Hi, ST->getBasePtr()}, + MemVT, ST->getMemOperand()); + } else if (Subtarget->hasMVEIntegerOps() && + ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1))) { + return LowerPredicateStore(Op, DAG); + } + + return SDValue(); +} + static bool isZeroVector(SDValue N) { return (ISD::isBuildVectorAllZeros(N.getNode()) || (N->getOpcode() == ARMISD::VMOVIMM && @@ -9155,13 +9509,87 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; - if (!PassThru.isUndef() && - (PassThru.getOpcode() != ISD::BITCAST || - !isZeroVector(PassThru->getOperand(0)))) + bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || + PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && + isZeroVector(PassThru->getOperand(0)); + if (!PassThru.isUndef() && !PassThruIsCastZero) Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } +static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEIntegerOps()) + return SDValue(); + + SDLoc dl(Op); + unsigned BaseOpcode = 0; + switch (Op->getOpcode()) { + default: llvm_unreachable("Expected VECREDUCE opcode"); + case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; + case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; + case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; + case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; + case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; + case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; + case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; + } + + SDValue Op0 = Op->getOperand(0); + EVT VT = Op0.getValueType(); + EVT EltVT = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumActiveLanes = NumElts; + + assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || + NumActiveLanes == 2) && + "Only expected a power 2 vector size"); + + // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements + // allows us to easily extract vector elements from the lanes. 
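A scalar model of the reduction strategy just described, for an 8-lane ADD: each Op(X, Rev(X)) step folds adjacent lanes together, halving the number of lanes that still matter, and once 4 remain they are extracted and combined directly (lane indices 0, 2, 4, 6 mirror the 0*NumElts/4 through 3*NumElts/4 extracts below):

    #include <cassert>

    int reduce_add8(const int (&v)[8]) {
      int a[8];
      // One VREV + add step: adjacent pairs collapse, even lanes stay live.
      for (int i = 0; i < 8; i += 2)
        a[i] = a[i + 1] = v[i] + v[i + 1];
      // Four live lanes left: extract and combine sequentially.
      return (a[0] + a[2]) + (a[4] + a[6]);
    }

    int main() {
      int v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      assert(reduce_add8(v) == 36);
      return 0;
    }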
+ while (NumActiveLanes > 4) { + unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32; + SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); + Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); + NumActiveLanes /= 2; + } + + SDValue Res; + if (NumActiveLanes == 4) { + // The remaining 4 elements are summed sequentially + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); + SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); + SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); + SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); + SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); + Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); + } else { + SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(0, dl, MVT::i32)); + SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, + DAG.getConstant(1, dl, MVT::i32)); + Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); + } + + // Result type may be wider than element type. + if (EltVT != Op->getValueType(0)) + Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); + return Res; +} + +static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { + if (!ST->hasMVEFloatOps()) + return SDValue(); + return LowerVecReduce(Op, DAG, ST); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or @@ -9231,12 +9659,13 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N, bool isBigEndian = DAG.getDataLayout().isBigEndian(); - Results.push_back( + SDValue Lo = DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); - Results.push_back( + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); + SDValue Hi = DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, - SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); + SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); Results.push_back(SDValue(CmpSwap, 2)); } @@ -9362,9 +9791,19 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::LOAD: return LowerPredicateLoad(Op, DAG); case ISD::STORE: - return LowerPredicateStore(Op, DAG); + return LowerSTORE(Op, DAG, Subtarget); case ISD::MLOAD: return LowerMLOAD(Op, DAG); + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return LowerVecReduce(Op, DAG, Subtarget); + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_FMIN: + case ISD::VECREDUCE_FMAX: + return LowerVecReduceF(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); @@ -9411,8 +9850,8 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, DAG.getVTList(MVT::i32, MVT::i32), N->getOperand(1), N->getOperand(2), Lo, Hi); - Results.push_back(LongMul.getValue(0)); - Results.push_back(LongMul.getValue(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, + LongMul.getValue(0), LongMul.getValue(1))); } /// ReplaceNodeResults - Replace the results of node with an illegal result @@ -9466,7 +9905,9 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ABS: lowerABS(N, Results, DAG); return ; - + case ISD::LOAD: + LowerLOAD(N, Results, DAG); + break; } if (Res.getNode()) Results.push_back(Res); @@ -9499,7 +9940,7 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); - unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); + unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; @@ -9507,11 +9948,11 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand::MOLoad, 4, Align(4)); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOStore, 4, 4); + MachineMemOperand::MOStore, 4, Align(4)); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -9697,7 +10138,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), - MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -9788,10 +10229,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. 
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); - if (Align == 0) - Align = MF->getDataLayout().getTypeAllocSize(C->getType()); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) @@ -9828,8 +10267,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addReg(NewVReg3) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( - MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), + MachineMemOperand::MOLoad, 4, Align(4)); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -9889,10 +10329,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. - unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); - if (Align == 0) - Align = MF->getDataLayout().getTypeAllocSize(C->getType()); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); Register VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) @@ -9922,8 +10360,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); - MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( - MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = + MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), + MachineMemOperand::MOLoad, 4, Align(4)); Register NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) @@ -10162,7 +10601,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, Register dest = MI.getOperand(0).getReg(); Register src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); - unsigned Align = MI.getOperand(3).getImm(); + unsigned Alignment = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); @@ -10175,17 +10614,17 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, bool IsThumb2 = Subtarget->isThumb2(); bool IsThumb = Subtarget->isThumb(); - if (Align & 1) { + if (Alignment & 1) { UnitSize = 1; - } else if (Align & 2) { + } else if (Alignment & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { - if ((Align % 16 == 0) && SizeVal >= 16) + if ((Alignment % 16 == 0) && SizeVal >= 16) UnitSize = 16; - else if ((Align % 8 == 0) && SizeVal >= 8) + else if ((Alignment % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. @@ -10291,13 +10730,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. 
-  unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
-  if (Align == 0)
-    Align = MF->getDataLayout().getTypeAllocSize(C->getType());
-  unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+  Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
   MachineMemOperand *CPMMO =
       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
-                               MachineMemOperand::MOLoad, 4, 4);
+                               MachineMemOperand::MOLoad, 4, Align(4));
   if (IsThumb)
     BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
@@ -11667,6 +12104,42 @@ static SDValue PerformAddeSubeCombine(SDNode *N,
   return SDValue();
 }
+static SDValue PerformVSELECTCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const ARMSubtarget *Subtarget) {
+  // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
+  //
+  // We need to re-implement this optimization here as the implementation in the
+  // Target-Independent DAGCombiner does not handle the kind of constant we make
+  // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
+  // good reason, allowing truncation there would break other targets).
+  //
+  // Currently, this is only done for MVE, as it's the only target that benefits
+  // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
+  if (!Subtarget->hasMVEIntegerOps())
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::XOR)
+    return SDValue();
+  SDValue XOR = N->getOperand(0);
+
+  // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
+  // It is important to check with truncation allowed as the BUILD_VECTORs we
+  // generate in those situations will truncate their operands.
+  ConstantSDNode *Const =
+      isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
+                          /*AllowTruncation*/ true);
+  if (!Const || !Const->isOne())
+    return SDValue();
+
+  // Rewrite into vselect(cond, rhs, lhs).
+  SDValue Cond = XOR->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  EVT Type = N->getValueType(0);
+  return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
+}
+
 static SDValue PerformABSCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
@@ -11724,6 +12197,71 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   return SDValue();
 }
+static SDValue PerformADDVecReduce(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
+  // will look like:
+  //   t1: i32,i32 = ARMISD::VADDLVs x
+  //   t2: i64 = build_pair t1, t1:1
+  //   t3: i64 = add t2, y
+  // We also need to check for sext / zext and commutative adds.
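+  // An illustrative sketch of the rewrite performed below (not from the
+  // original source): given t1/t2/t3 as above,
+  //   t3: i64 = add (build_pair t1, t1:1), y
+  // becomes
+  //   t4: i32,i32 = ARMISD::VADDLVAs lo(y), hi(y), x
+  //   t5: i64 = build_pair t4, t4:1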
+  auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
+                           SDValue NB) {
+    if (NB->getOpcode() != ISD::BUILD_PAIR)
+      return SDValue();
+    SDValue VecRed = NB->getOperand(0);
+    if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
+        NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
+      return SDValue();
+
+    SDLoc dl(N);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+                                  DCI.DAG.getConstant(0, dl, MVT::i32)));
+    Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+                                  DCI.DAG.getConstant(1, dl, MVT::i32)));
+    for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
+      Ops.push_back(VecRed->getOperand(i));
+    SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
+                                  DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
+    return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
+                           SDValue(Red.getNode(), 1));
+  };
+
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
+    return M;
+  if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
+    return M;
+  return SDValue();
+}
+
 bool
 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
                                                  CombineLevel Level) const {
@@ -11895,6 +12433,9 @@ static SDValue PerformADDCombine(SDNode *N,
   if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
     return Result;
+  if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget))
+    return Result;
+
   // First try with the default operand order.
   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
     return Result;
@@ -11986,18 +12527,86 @@ static SDValue PerformVMULCombine(SDNode *N,
                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
 }
+static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v2i64)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  auto IsSignExt = [&](SDValue Op) {
+    if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
+      return SDValue();
+    EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
+    if (VT.getScalarSizeInBits() == 32)
+      return Op->getOperand(0);
+    return SDValue();
+  };
+  auto IsZeroExt = [&](SDValue Op) {
+    // Zero extends are a little more awkward. At the point we are matching
+    // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
+    // That might be before or after a bitcast depending on how the and is
+    // placed. Because this has to look through bitcasts, it is currently only
+    // supported on LE.
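+    // A hypothetical instance of the pattern being matched here:
+    //   (v2i64 bitcast (and (v4i32 x), (build_vector -1, 0, -1, 0)))
+    // i.e. a zero-extend that keeps only the low half of each 64-bit lane.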
+ if (!Subtarget->isLittle()) + return SDValue(); + + SDValue And = Op; + if (And->getOpcode() == ISD::BITCAST) + And = And->getOperand(0); + if (And->getOpcode() != ISD::AND) + return SDValue(); + SDValue Mask = And->getOperand(1); + if (Mask->getOpcode() == ISD::BITCAST) + Mask = Mask->getOperand(0); + + if (Mask->getOpcode() != ISD::BUILD_VECTOR || + Mask.getValueType() != MVT::v4i32) + return SDValue(); + if (isAllOnesConstant(Mask->getOperand(0)) && + isNullConstant(Mask->getOperand(1)) && + isAllOnesConstant(Mask->getOperand(2)) && + isNullConstant(Mask->getOperand(3))) + return And->getOperand(0); + return SDValue(); + }; + + SDLoc dl(N); + if (SDValue Op0 = IsSignExt(N0)) { + if (SDValue Op1 = IsSignExt(N1)) { + SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); + SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); + return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); + } + } + if (SDValue Op0 = IsZeroExt(N0)) { + if (SDValue Op1 = IsZeroExt(N1)) { + SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); + SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); + return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); + } + } + + return SDValue(); +} + static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) + return PerformMVEVMULLCombine(N, DAG, Subtarget); + if (Subtarget->isThumb1Only()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); - EVT VT = N->getValueType(0); if (VT.is64BitVector() || VT.is128BitVector()) return PerformVMULCombine(N, DCI, Subtarget); if (VT != MVT::i32) @@ -12182,20 +12791,21 @@ static SDValue PerformANDCombine(SDNode *N, EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || + VT == MVT::v8i1 || VT == MVT::v16i1) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && Subtarget->hasNEON() && + if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize <= 64) { + if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || + SplatBitSize == 64) { EVT VbicVT; SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VbicVT, VT.is128BitVector(), - OtherModImm); + DAG, dl, VbicVT, VT, OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); @@ -12425,58 +13035,44 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) { }; } +static ARMCC::CondCodes getVCMPCondCode(SDValue N) { + if (N->getOpcode() == ARMISD::VCMP) + return (ARMCC::CondCodes)N->getConstantOperandVal(2); + else if (N->getOpcode() == ARMISD::VCMPZ) + return (ARMCC::CondCodes)N->getConstantOperandVal(1); + else + llvm_unreachable("Not a VCMP/VCMPZ!"); +} + +static bool CanInvertMVEVCMP(SDValue N) { + ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); + return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); +} + static SDValue PerformORCombine_i1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Try to invert "or A, B" -> 
"and ~A, ~B", as the "and" is easier to chain // together with predicates EVT VT = N->getValueType(0); + SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ARMCC::CondCodes CondCode0 = ARMCC::AL; - ARMCC::CondCodes CondCode1 = ARMCC::AL; - if (N0->getOpcode() == ARMISD::VCMP) - CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2)) - ->getZExtValue(); - else if (N0->getOpcode() == ARMISD::VCMPZ) - CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1)) - ->getZExtValue(); - if (N1->getOpcode() == ARMISD::VCMP) - CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2)) - ->getZExtValue(); - else if (N1->getOpcode() == ARMISD::VCMPZ) - CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1)) - ->getZExtValue(); - - if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) - return SDValue(); - - unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); - unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); + auto IsFreelyInvertable = [&](SDValue V) { + if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) + return CanInvertMVEVCMP(V); + return false; + }; - if (!isValidMVECond(Opposite0, - N0->getOperand(0)->getValueType(0).isFloatingPoint()) || - !isValidMVECond(Opposite1, - N1->getOperand(0)->getValueType(0).isFloatingPoint())) + // At least one operand must be freely invertable. + if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) return SDValue(); - SmallVector<SDValue, 4> Ops0; - Ops0.push_back(N0->getOperand(0)); - if (N0->getOpcode() == ARMISD::VCMP) - Ops0.push_back(N0->getOperand(1)); - Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); - SmallVector<SDValue, 4> Ops1; - Ops1.push_back(N1->getOperand(0)); - if (N1->getOpcode() == ARMISD::VCMP) - Ops1.push_back(N1->getOperand(1)); - Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); - - SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); - SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); - SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); - return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, - DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); + SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT); + SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT); + SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); + return DCI.DAG.getLogicalNOT(DL, And, VT); } /// PerformORCombine - Target-specific dag combine xforms for ISD::OR @@ -12492,17 +13088,21 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + if (Subtarget->hasMVEIntegerOps() && + (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + return PerformORCombine_i1(N, DCI, Subtarget); + APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && Subtarget->hasNEON() && + if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - if (SplatBitSize <= 64) { + if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || + SplatBitSize == 64) { EVT VorrVT; - SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(), - SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VorrVT, VT.is128BitVector(), - OtherModImm); + SDValue Val = + isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), + SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); if (Val.getNode()) { SDValue Input 
= DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); @@ -12563,10 +13163,6 @@ static SDValue PerformORCombine(SDNode *N, } } - if (Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) - return PerformORCombine_i1(N, DCI, Subtarget); - // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { @@ -12598,6 +13194,27 @@ static SDValue PerformXORCombine(SDNode *N, return Result; } + if (Subtarget->hasMVEIntegerOps()) { + // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + const TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->isConstTrueVal(N1.getNode()) && + (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { + if (CanInvertMVEVCMP(N0)) { + SDLoc DL(N0); + ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); + + SmallVector<SDValue, 4> Ops; + Ops.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops.push_back(N0->getOperand(1)); + Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32)); + return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); + } + } + } + return SDValue(); } @@ -12796,6 +13413,78 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op0 = N->getOperand(0); + + // VMOVhr (VMOVrh (X)) -> X + if (Op0->getOpcode() == ARMISD::VMOVrh) + return Op0->getOperand(0); + + // FullFP16: half values are passed in S-registers, and we don't + // need any of the bitcast and moves: + // + // t2: f32,ch = CopyFromReg t0, Register:f32 %0 + // t5: i32 = bitcast t2 + // t18: f16 = ARMISD::VMOVhr t5 + if (Op0->getOpcode() == ISD::BITCAST) { + SDValue Copy = Op0->getOperand(0); + if (Copy.getValueType() == MVT::f32 && + Copy->getOpcode() == ISD::CopyFromReg) { + SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; + SDValue NewCopy = + DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); + return NewCopy; + } + } + + // fold (VMOVhr (load x)) -> (load (f16*)x) + if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { + if (LN0->hasOneUse() && LN0->isUnindexed() && + LN0->getMemoryVT() == MVT::i16) { + SDValue Load = + DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), + LN0->getBasePtr(), LN0->getMemOperand()); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); + DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1)); + return Load; + } + } + + // Only the bottom 16 bits of the source register are used. 
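+  // For example (an illustrative case, not from the source):
+  //   VMOVhr (and x, 0xffff) can simplify to VMOVhr x, since the AND only
+  //   clears bits that this node never demands.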
+ APInt DemandedMask = APInt::getLowBitsSet(32, 16); + const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) + return SDValue(N, 0); + + return SDValue(); +} + +static SDValue PerformVMOVrhCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // fold (VMOVrh (load x)) -> (zextload (i16*)x) + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { + LoadSDNode *LN0 = cast<LoadSDNode>(N0); + + SDValue Load = + DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), + LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); + DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); + return Load; + } + + // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) + if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(N0->getOperand(1))) + return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), + N0->getOperand(1)); + + return SDValue(); +} + /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded @@ -12946,8 +13635,29 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // If the valuetypes are the same, we can remove the cast entirely. if (Op->getOperand(0).getValueType() == VT) return Op->getOperand(0); - return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, - Op->getOperand(0).getValueType(), Op->getOperand(0)); + return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); + } + + return SDValue(); +} + +static SDValue +PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + EVT VT = N->getValueType(0); + SDValue Op = N->getOperand(0); + SDLoc dl(N); + + // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST + if (ST->isLittle()) + return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op); + + // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) + if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { + // If the valuetypes are the same, we can remove the cast entirely. + if (Op->getOperand(0).getValueType() == VT) + return Op->getOperand(0); + return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); } return SDValue(); @@ -13012,6 +13722,29 @@ static SDValue PerformInsertEltCombine(SDNode *N, return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); } +static SDValue PerformExtractEltCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Op0 = N->getOperand(0); + EVT VT = N->getValueType(0); + SDLoc dl(N); + + // extract (vdup x) -> x + if (Op0->getOpcode() == ARMISD::VDUP) { + SDValue X = Op0->getOperand(0); + if (VT == MVT::f16 && X.getValueType() == MVT::i32) + return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); + if (VT == MVT::i32 && X.getValueType() == MVT::f16) + return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); + + while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) + X = X->getOperand(0); + if (X.getValueType() == VT) + return X; + } + + return SDValue(); +} + /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for /// ISD::VECTOR_SHUFFLE. 
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
@@ -13293,6 +14026,128 @@ static SDValue PerformVLDCombine(SDNode *N,
   return CombineBaseUpdate(N, DCI);
 }
+static SDValue PerformMVEVLDCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Addr = N->getOperand(2);
+  MemSDNode *MemN = cast<MemSDNode>(N);
+  SDLoc dl(N);
+
+  // For the stores, where there are multiple intrinsics, we only actually want
+  // to post-inc the last of them.
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_mve_vst2q &&
+      cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
+    return SDValue();
+  if (IntNo == Intrinsic::arm_mve_vst4q &&
+      cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
+    return SDValue();
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+                            UE = Addr.getNode()->use_end();
+       UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle. We can avoid searching through Addr as it's a
+    // predecessor to both.
+    SmallPtrSet<const SDNode *, 32> Visited;
+    SmallVector<const SDNode *, 16> Worklist;
+    Visited.insert(Addr.getNode());
+    Worklist.push_back(N);
+    Worklist.push_back(User);
+    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+        SDNode::hasPredecessorHelper(User, Visited, Worklist))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoadOp = true;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    switch (IntNo) {
+    default:
+      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
+    case Intrinsic::arm_mve_vld2q:
+      NewOpc = ARMISD::VLD2_UPD;
+      NumVecs = 2;
+      break;
+    case Intrinsic::arm_mve_vld4q:
+      NewOpc = ARMISD::VLD4_UPD;
+      NumVecs = 4;
+      break;
+    case Intrinsic::arm_mve_vst2q:
+      NewOpc = ARMISD::VST2_UPD;
+      NumVecs = 2;
+      isLoadOp = false;
+      break;
+    case Intrinsic::arm_mve_vst4q:
+      NewOpc = ARMISD::VST4_UPD;
+      NumVecs = 4;
+      isLoadOp = false;
+      break;
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoadOp) {
+      VecTy = N->getValueType(0);
+    } else {
+      VecTy = N->getOperand(3).getValueType();
+    }
+
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+    if (!CInc || CInc->getZExtValue() != NumBytes)
+      continue;
+
+    // Create the new updating load/store node.
+    // First, create an SDVTList for the new updating node's results.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+    // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops; + Ops.push_back(N->getOperand(0)); // incoming chain + Ops.push_back(N->getOperand(2)); // ptr + Ops.push_back(Inc); + + for (unsigned i = 3; i < N->getNumOperands(); ++i) + Ops.push_back(N->getOperand(i)); + + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy, + MemN->getMemOperand()); + + // Update the uses. + SmallVector<SDValue, 5> NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) + NewResults.push_back(SDValue(UpdN.getNode(), i)); + + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + + return SDValue(); +} + /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and @@ -13377,8 +14232,21 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SDValue Op = N->getOperand(0); + EVT VT = N->getValueType(0); + + // On MVE, we just convert the VDUPLANE to a VDUP with an extract. + if (Subtarget->hasMVEIntegerOps()) { + EVT ExtractVT = VT.getVectorElementType(); + // We need to ensure we are creating a legal type. + if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) + ExtractVT = MVT::i32; + SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT, + N->getOperand(0), N->getOperand(1)); + return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract); + } // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. @@ -13399,7 +14267,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N, unsigned EltBits; if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0) EltSize = 8; - EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) return SDValue(); @@ -13412,6 +14279,18 @@ static SDValue PerformVDUPCombine(SDNode *N, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); + SDLoc dl(N); + + if (Subtarget->hasMVEIntegerOps()) { + // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will + // need to come from a GPR. 
+    if (Op.getValueType() == MVT::f32)
+      return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+                             DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
+    else if (Op.getValueType() == MVT::f16)
+      return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+                             DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
+  }
   if (!Subtarget->hasNEON())
     return SDValue();
@@ -13540,7 +14419,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
   if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
     return SDValue();
   SDValue Trunc = St->getValue();
-  if (Trunc->getOpcode() != ISD::TRUNCATE)
+  if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
     return SDValue();
   EVT FromVT = Trunc->getOperand(0).getValueType();
   EVT ToVT = Trunc.getValueType();
@@ -13555,20 +14434,54 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
     NumElements = 4;
   if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
     NumElements = 8;
-  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+  if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
+    NumElements = 4;
+  if (NumElements == 0 ||
+      (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
       FromVT.getVectorNumElements() % NumElements != 0)
     return SDValue();
+  // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
+  // use the VMOVN over splitting the store. We are looking for patterns of:
+  // !rev: 0 N 1 N+1 2 N+2 ...
+  //  rev: N 0 N+1 1 N+2 2 ...
+  auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
+    unsigned NumElts = ToVT.getVectorNumElements();
+    if (NumElts != M.size())
+      return false;
+
+    unsigned Off0 = rev ? NumElts : 0;
+    unsigned Off1 = rev ? 0 : NumElts;
+
+    for (unsigned i = 0; i < NumElts; i += 2) {
+      if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+        return false;
+      if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
+        return false;
+    }
+
+    return true;
+  };
+
+  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
+    if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
+        isVMOVNOriginalMask(Shuffle->getMask(), true))
+      return SDValue();
+
+  LLVMContext &C = *DAG.getContext();
   SDLoc DL(St);
   // Details about the old store
   SDValue Ch = St->getChain();
   SDValue BasePtr = St->getBasePtr();
-  unsigned Alignment = St->getOriginalAlignment();
+  Align Alignment = St->getOriginalAlign();
   MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
   AAMDNodes AAInfo = St->getAAInfo();
-  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
-  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+  // We split the store into slices of NumElements. fp16 trunc stores are
+  // converted with a VCVTN and then stored as truncating integer stores.
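+  // For example (an illustrative sketch): a v8f32 -> v8f16 truncating store
+  // is split into two v4f32 slices, each narrowed with a VCVTN and stored
+  // via a v4i32 -> v4i16 truncating integer store.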
+  EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
   SmallVector<SDValue, 4> Stores;
   for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
@@ -13578,9 +14491,17 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
     SDValue Extract =
         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                     DAG.getConstant(i * NumElements, DL, MVT::i32));
+
+    if (ToEltVT == MVT::f16) {
+      SDValue FPTrunc =
+          DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
+                      Extract, DAG.getConstant(0, DL, MVT::i32));
+      Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
+    }
+
     SDValue Store = DAG.getTruncStore(
         Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
-        NewToVT, Alignment, MMOFlags, AAInfo);
+        NewToVT, Alignment.value(), MMOFlags, AAInfo);
     Stores.push_back(Store);
   }
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -13778,8 +14699,163 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
 }
+static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
+                                           const ARMSubtarget *ST) {
+  if (!ST->hasMVEIntegerOps())
+    return SDValue();
+
+  assert(N->getOpcode() == ISD::VECREDUCE_ADD);
+  EVT ResVT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDLoc dl(N);
+
+  // We are looking for something that will have illegal types if left alone,
+  // but that we can convert to a single instruction under MVE. For example
+  // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
+  // or
+  // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
+
+  // Cases:
+  //   VADDV u/s 8/16/32
+  //   VMLAV u/s 8/16/32
+  //   VADDLV u/s 32
+  //   VMLALV u/s 16/32
+
+  auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
+    if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
+      return SDValue();
+    SDValue A = N0->getOperand(0);
+    if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+      return A;
+    return SDValue();
+  };
+  auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
+                     SDValue &A, SDValue &B) {
+    if (ResVT != RetTy || N0->getOpcode() != ISD::MUL)
+      return false;
+    SDValue ExtA = N0->getOperand(0);
+    SDValue ExtB = N0->getOperand(1);
+    if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+      return false;
+    A = ExtA->getOperand(0);
+    B = ExtB->getOperand(0);
+    if (A.getValueType() == B.getValueType() &&
+        llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+      return true;
+    return false;
+  };
+  auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
+    SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
+                       SDValue(Node.getNode(), 1));
+  };
+
+  if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
+    return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
+  if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
+    return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
+  if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
+    return Create64bitNode(ARMISD::VADDLVs, {A});
+  if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
+    return Create64bitNode(ARMISD::VADDLVu, {A});
+
+  SDValue A, B;
+  if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
+    return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
+  if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
+    return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
+  if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+    return Create64bitNode(ARMISD::VMLALVs, {A, B});
+  if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+    return Create64bitNode(ARMISD::VMLALVu, {A, B});
+  return SDValue();
+}
+
+static SDValue PerformVMOVNCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned IsTop = N->getConstantOperandVal(2);
+
+  // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
+  // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
+  if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
+       Op1->getOpcode() == ARMISD::VQMOVNu) &&
+      Op1->getConstantOperandVal(2) == 0)
+    return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
+                           Op0, Op1->getOperand(1), N->getOperand(2));
+
+  // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
+  // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
+  // into the top or bottom lanes.
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
+  APInt Op0DemandedElts =
+      IsTop ? Op1DemandedElts
+            : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+  if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
+static SDValue PerformVQMOVNCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op0 = N->getOperand(0);
+  unsigned IsTop = N->getConstantOperandVal(2);
+
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  APInt Op0DemandedElts =
+      APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
+                                     : APInt::getHighBitsSet(2, 1));
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+  return SDValue();
+}
+
+static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up
+  // from uses of the intrinsics.
+  if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+    int ShiftAmt = C->getSExtValue();
+    if (ShiftAmt == 0) {
+      SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
+      DAG.ReplaceAllUsesWith(N, Merge.getNode());
+      return SDValue();
+    }
+
+    if (ShiftAmt >= -32 && ShiftAmt < 0) {
+      unsigned NewOpcode =
+          N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
+      SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
+                                     DAG.getConstant(-ShiftAmt, DL, MVT::i32));
+      DAG.ReplaceAllUsesWith(N, NewShift.getNode());
+      return NewShift;
+    }
+  }
+
+  return SDValue();
+}
+
 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   switch (IntNo) {
   default:
@@ -13928,6 +15004,72 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
   case Intrinsic::arm_neon_vqrshiftu:
     // No immediate versions of these to check for.
     break;
+
+  case Intrinsic::arm_mve_vqdmlah:
+  case Intrinsic::arm_mve_vqdmlash:
+  case Intrinsic::arm_mve_vqrdmlah:
+  case Intrinsic::arm_mve_vqrdmlash:
+  case Intrinsic::arm_mve_vmla_n_predicated:
+  case Intrinsic::arm_mve_vmlas_n_predicated:
+  case Intrinsic::arm_mve_vqdmlah_predicated:
+  case Intrinsic::arm_mve_vqdmlash_predicated:
+  case Intrinsic::arm_mve_vqrdmlah_predicated:
+  case Intrinsic::arm_mve_vqrdmlash_predicated: {
+    // These intrinsics all take an i32 scalar operand which is narrowed to the
+    // size of a single lane of the vector type they return. So we don't need
+    // any bits of that operand above that point, which allows us to eliminate
+    // uxth/sxth.
+    unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+    if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
+      return SDValue();
+    break;
+  }
+
+  case Intrinsic::arm_mve_minv:
+  case Intrinsic::arm_mve_maxv:
+  case Intrinsic::arm_mve_minav:
+  case Intrinsic::arm_mve_maxav:
+  case Intrinsic::arm_mve_minv_predicated:
+  case Intrinsic::arm_mve_maxv_predicated:
+  case Intrinsic::arm_mve_minav_predicated:
+  case Intrinsic::arm_mve_maxav_predicated: {
+    // These intrinsics all take an i32 scalar operand which is narrowed to the
+    // size of a single lane of the vector type they take as the other input.
+    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+      return SDValue();
+    break;
+  }
+
+  case Intrinsic::arm_mve_addv: {
+    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
+    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
+    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
+    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
+  }
+
+  case Intrinsic::arm_mve_addlv:
+  case Intrinsic::arm_mve_addlv_predicated: {
+    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
+    // which recombines the two outputs into an i64.
+    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
+                     (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
+                     (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
+
+    SmallVector<SDValue, 4> Ops;
+    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
+      if (i != 2) // skip the unsigned flag
+        Ops.push_back(N->getOperand(i));
+
+    SDLoc dl(N);
+    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
+    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
+                       val.getValue(1));
+  }
   }
   return SDValue();
@@ -14023,9 +15165,10 @@ static SDValue PerformShiftCombine(SDNode *N,
   return SDValue();
 }
-// Look for a sign/zero extend of a larger than legal load. This can be split
-// into two extending loads, which are simpler to deal with than an arbitrary
-// sign extend.
+// Look for a sign/zero/fp extend of a larger than legal load. This can be
+// split into multiple extending loads, which are simpler to deal with than an
+// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
+// to convert the type to an f32.
 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::LOAD)
@@ -14047,45 +15190,63 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
     NumElements = 4;
   if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
     NumElements = 8;
+  if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
+    NumElements = 4;
   if (NumElements == 0 ||
-      FromVT.getVectorNumElements() == NumElements ||
+      (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
       FromVT.getVectorNumElements() % NumElements != 0 ||
       !isPowerOf2_32(NumElements))
     return SDValue();
+  LLVMContext &C = *DAG.getContext();
   SDLoc DL(LD);
   // Details about the old load
   SDValue Ch = LD->getChain();
   SDValue BasePtr = LD->getBasePtr();
-  unsigned Alignment = LD->getOriginalAlignment();
+  Align Alignment = LD->getOriginalAlign();
   MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
   AAMDNodes AAInfo = LD->getAAInfo();
   ISD::LoadExtType NewExtType =
       N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
   SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
-  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
-  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
-  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
-  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
-
-  // Split the load in half, each side of which is extended separately. This
-  // is good enough, as legalisation will take it from there. They are either
-  // already legal or they will be split further into something that is
-  // legal.
-  SDValue NewLoad1 =
-      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
-                  LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
-  SDValue NewLoad2 =
-      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
-                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
-                  Alignment, MMOFlags, AAInfo);
-
-  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                                 SDValue(NewLoad1.getNode(), 1),
-                                 SDValue(NewLoad2.getNode(), 1));
+  EVT NewFromVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+  EVT NewToVT = EVT::getVectorVT(
+      C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+    unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+    SDValue NewLoad =
+        DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                    LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                    Alignment.value(), MMOFlags, AAInfo);
+    Loads.push_back(NewLoad);
+    Chains.push_back(SDValue(NewLoad.getNode(), 1));
+  }
+
+  // Float truncs need to be extended with VCVTLs into their floating point
+  // types.
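+  // For example (illustrative): an fpext of a v8f16 load to v8f32 becomes
+  // two extending v4i16 -> v4i32 loads, each reinterpreted as v8f16,
+  // converted with a VCVTL to v4f32 and then concatenated.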
+  if (FromEltVT == MVT::f16) {
+    SmallVector<SDValue, 4> Extends;
+
+    for (unsigned i = 0; i < Loads.size(); i++) {
+      SDValue LoadBC =
+          DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
+      SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
+                                  DAG.getConstant(0, DL, MVT::i32));
+      Extends.push_back(FPExt);
+    }
+
+    Loads = Extends;
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
 }
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
@@ -14133,6 +15294,116 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
+static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *ST) {
+  if (ST->hasMVEFloatOps())
+    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+      return NewLoad;
+
+  return SDValue();
+}
+
+/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
+/// saturates.
+static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  if (!ST->hasMVEIntegerOps())
+    return SDValue();
+
+  if (VT != MVT::v4i32 && VT != MVT::v8i16)
+    return SDValue();
+
+  auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
+    // Check that one is a smin and the other is a smax
+    if (Min->getOpcode() != ISD::SMIN)
+      std::swap(Min, Max);
+    if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
+      return false;
+
+    APInt SaturateC;
+    if (VT == MVT::v4i32)
+      SaturateC = APInt(32, (1 << 15) - 1, true);
+    else //if (VT == MVT::v8i16)
+      SaturateC = APInt(16, (1 << 7) - 1, true);
+
+    APInt MinC, MaxC;
+    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+        MinC != SaturateC)
+      return false;
+    if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
+        MaxC != ~SaturateC)
+      return false;
+    return true;
+  };
+
+  if (IsSignedSaturate(N, N0.getNode())) {
+    SDLoc DL(N);
+    MVT ExtVT, HalfVT;
+    if (VT == MVT::v4i32) {
+      HalfVT = MVT::v8i16;
+      ExtVT = MVT::v4i16;
+    } else { // if (VT == MVT::v8i16)
+      HalfVT = MVT::v16i8;
+      ExtVT = MVT::v8i8;
+    }
+
+    // Create a VQMOVNB with undef top lanes, then sign extended into the top
+    // half. That extend will hopefully be removed if only the bottom bits are
+    // demanded (through a truncating store, for example).
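+    // An illustrative example (not from the source) for v4i32:
+    //   smin (smax x, -32768), 32767
+    // becomes
+    //   sign_extend_inreg (VECTOR_REG_CAST (VQMOVNs undef, x)), v4i16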
+    SDValue VQMOVN =
+        DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
+                    N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
+    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
+                       DAG.getValueType(ExtVT));
+  }
+
+  auto IsUnsignedSaturate = [&](SDNode *Min) {
+    // For unsigned, we just need to check for <= 0xffff
+    if (Min->getOpcode() != ISD::UMIN)
+      return false;
+
+    APInt SaturateC;
+    if (VT == MVT::v4i32)
+      SaturateC = APInt(32, (1 << 16) - 1, true);
+    else //if (VT == MVT::v8i16)
+      SaturateC = APInt(16, (1 << 8) - 1, true);
+
+    APInt MinC;
+    if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+        MinC != SaturateC)
+      return false;
+    return true;
+  };
+
+  if (IsUnsignedSaturate(N)) {
+    SDLoc DL(N);
+    MVT HalfVT;
+    unsigned ExtConst;
+    if (VT == MVT::v4i32) {
+      HalfVT = MVT::v8i16;
+      ExtConst = 0x0000FFFF;
+    } else { //if (VT == MVT::v8i16)
+      HalfVT = MVT::v16i8;
+      ExtConst = 0x00FF;
+    }
+
+    // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
+    // an AND. That extend will hopefully be removed if only the bottom bits are
+    // demanded (through a truncating store, for example).
+    SDValue VQMOVN =
+        DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
+                    DAG.getConstant(0, DL, MVT::i32));
+    SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+    return DAG.getNode(ISD::AND, DL, VT, Bitcast,
+                       DAG.getConstant(ExtConst, DL, VT));
+  }
+
+  return SDValue();
+}
+
 static const APInt *isPowerOf2Constant(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   if (!C)
@@ -14614,10 +15885,41 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   return Res;
 }
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                                     const ARMSubtarget *ST) {
+  SDValue Src = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+
+  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
+  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
+    EVT SrcVT = Src.getValueType();
+    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
+      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
+  }
+
+  // We may have a bitcast of something that has already had this bitcast
+  // combine performed on it, so skip past any VECTOR_REG_CASTs.
+  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
+    Src = Src.getOperand(0);
+
+  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
+  // would be generated is at least the width of the element type.
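+  // e.g. (illustrative) on big-endian, (v2i64 bitcast (v4i32 VMOVIMM c))
+  // can use a VECTOR_REG_CAST: every lane holds the same immediate, so the
+  // VREV a plain bitcast would otherwise need is redundant.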
+ EVT SrcVT = Src.getValueType(); + if ((Src.getOpcode() == ARMISD::VMOVIMM || + Src.getOpcode() == ARMISD::VMVNIMM || + Src.getOpcode() == ARMISD::VMOVFPIMM) && + SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && + DAG.getDataLayout().isBigEndian()) + return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); + + return SDValue(); +} + SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; + case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); @@ -14635,25 +15937,37 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); + case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); + case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI); case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); - case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); + case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); + case ISD::INTRINSIC_WO_CHAIN: + return PerformIntrinsicCombine(N, DCI); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::ANY_EXTEND: + return PerformExtendCombine(N, DCI.DAG, Subtarget); + case ISD::FP_EXTEND: + return PerformFPExtendCombine(N, DCI.DAG, Subtarget); + case ISD::SMIN: + case ISD::UMIN: + case ISD::SMAX: + case ISD::UMAX: + return PerformMinMaxCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); @@ -14664,10 +15978,25 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); + case ISD::BITCAST: + return PerformBITCASTCombine(N, DCI.DAG, Subtarget); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); + case ARMISD::VECTOR_REG_CAST: + return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget); case ARMISD::VCMP: return PerformVCMPCombine(N, DCI, Subtarget); + case ISD::VECREDUCE_ADD: + return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); + case ARMISD::VMOVN: + return PerformVMOVNCombine(N, DCI); + case ARMISD::VQMOVNs: + case ARMISD::VQMOVNu: + return PerformVQMOVNCombine(N, DCI); + case ARMISD::ASRL: + case ARMISD::LSRL: + case ARMISD::LSLL: + return PerformLongShiftCombine(N, DCI.DAG); case ARMISD::SMULWB: { unsigned 
BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -14756,6 +16085,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: return PerformVLDCombine(N, DCI); + case Intrinsic::arm_mve_vld2q: + case Intrinsic::arm_mve_vld4q: + case Intrinsic::arm_mve_vst2q: + case Intrinsic::arm_mve_vst4q: + return PerformMVEVLDCombine(N, DCI); default: break; } break; @@ -14839,28 +16173,21 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, return false; } -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); -} EVT ARMTargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... - if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && + if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; - if (Size >= 16 && - (memOpAlign(SrcAlign, DstAlign, 16) || + if (Op.size() >= 16 && + (Op.isAligned(Align(16)) || (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::v2f64; - } else if (Size >= 8 && - (memOpAlign(SrcAlign, DstAlign, 8) || + } else if (Op.size() >= 8 && + (Op.isAligned(Align(8)) || (allowsMisalignedMemoryAccesses( MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { @@ -14974,45 +16301,97 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I, if (!Subtarget->hasMVEIntegerOps()) return false; - auto IsSinker = [](Instruction *I, int Operand) { + auto IsFMSMul = [&](Instruction *I) { + if (!I->hasOneUse()) + return false; + auto *Sub = cast<Instruction>(*I->users().begin()); + return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; + }; + auto IsFMS = [&](Instruction *I) { + if (match(I->getOperand(0), m_FNeg(m_Value())) || + match(I->getOperand(1), m_FNeg(m_Value()))) + return true; + return false; + }; + + auto IsSinker = [&](Instruction *I, int Operand) { switch (I->getOpcode()) { case Instruction::Add: case Instruction::Mul: + case Instruction::FAdd: case Instruction::ICmp: + case Instruction::FCmp: return true; + case Instruction::FMul: + return !IsFMSMul(I); case Instruction::Sub: + case Instruction::FSub: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: return Operand == 1; + case Instruction::Call: + if (auto *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::fma: + return !IsFMS(I); + default: + return false; + } + } + return false; default: return false; } }; - int Op = 0; - if (!isa<ShuffleVectorInst>(I->getOperand(Op))) - Op = 1; - if (!IsSinker(I, Op)) - return false; - if (!match(I->getOperand(Op), - m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()), - m_Undef(), m_Zero()))) { - return false; - } - Instruction *Shuffle = cast<Instruction>(I->getOperand(Op)); - // All uses of the shuffle should be sunk to avoid duplicating it across gpr - // and vector registers - for (Use &U : Shuffle->uses()) { - Instruction *Insn = cast<Instruction>(U.getUser()); - if (!IsSinker(Insn, U.getOperandNo())) - return false; + 
for (auto OpIdx : enumerate(I->operands())) { + Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get()); + // Make sure we are not already sinking this operand + if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; })) + continue; + + Instruction *Shuffle = Op; + if (Shuffle->getOpcode() == Instruction::BitCast) + Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0)); + // We are looking for a splat that can be sunk. + if (!Shuffle || + !match(Shuffle, m_Shuffle( + m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_ZeroMask()))) + continue; + if (!IsSinker(I, OpIdx.index())) + continue; + + // All uses of the shuffle should be sunk to avoid duplicating it across gpr + // and vector registers + for (Use &U : Op->uses()) { + Instruction *Insn = cast<Instruction>(U.getUser()); + if (!IsSinker(Insn, U.getOperandNo())) + return false; + } + + Ops.push_back(&Shuffle->getOperandUse(0)); + if (Shuffle != Op) + Ops.push_back(&Op->getOperandUse(0)); + Ops.push_back(&OpIdx.value()); } - Ops.push_back(&Shuffle->getOperandUse(0)); - Ops.push_back(&I->getOperandUse(Op)); return true; } +Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const { + if (!Subtarget->hasMVEIntegerOps()) + return nullptr; + Type *SVIType = SVI->getType(); + Type *ScalarType = SVIType->getScalarType(); + + if (ScalarType->isFloatTy()) + return Type::getInt32Ty(SVIType->getContext()); + if (ScalarType->isHalfTy()) + return Type::getInt16Ty(SVIType->getContext()); + return nullptr; +} + bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -15024,6 +16403,9 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } + if (Subtarget->hasMVEIntegerOps()) + return true; + // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no @@ -15445,7 +16827,7 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, +static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { @@ -15480,16 +16862,16 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, // (in BE/masked) type. 
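(Editorial sketch, not part of the patch.) The operand-sinking rewrite above now walks every operand, looks through a bitcast, and matches the canonical IR splat idiom: an insertelement into lane 0 followed by a zero-mask shufflevector. A minimal standalone form of that match, using the same PatternMatch helpers the hunk itself uses (isSinkableSplat is a hypothetical helper introduced only for illustration):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Recognizes:  %i = insertelement <N x T> undef, T %s, i32 0
//              %v = shufflevector <N x T> %i, <N x T> undef, zeroinitializer
static bool isSinkableSplat(Value *V) {
  return match(V, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                            m_Undef(), m_ZeroMask()));
}

Sinking the shuffle (and its insertelement) next to each MVE user lets instruction selection fold the splat into the scalar-operand ("vdup") form of the instruction instead of keeping a live vector register across the block.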
Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { - if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) + if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) return true; } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Align >= 4 && + } else if (Alignment >= 4 && (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Align >= 2 && + else if (Alignment >= 2 && (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; @@ -15511,28 +16893,28 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, EVT VT; SDValue Ptr; - unsigned Align; + Align Alignment; bool isSEXTLoad = false; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); IsMasked = true; } else return false; @@ -15541,9 +16923,9 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, - IsMasked, Subtarget->isLittle(), Base, - Offset, isInc, DAG); + getMVEIndexedAddressParts( + Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, + Subtarget->isLittle(), Base, Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -15569,31 +16951,31 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; - unsigned Align; + Align Alignment; bool isSEXTLoad = false, isNonExt; bool IsMasked = false; if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); isNonExt = !ST->isTruncatingStore(); } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); - Align = LD->getAlignment(); + Alignment = LD->getAlign(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; IsMasked = true; } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); - Align = ST->getAlignment(); + Alignment = ST->getAlign(); isNonExt = !ST->isTruncatingStore(); IsMasked = true; } else @@ -15619,7 +17001,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (VT.isVector()) isLegal = 
Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
+ getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
 Subtarget->isLittle(), Base, Offset,
 isInc, DAG);
 else {
@@ -15734,18 +17116,23 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 if (Op.getOpcode() == ARMISD::VGETLANEs)
 Known = Known.sext(DstSz);
 else {
- Known = Known.zext(DstSz, true /* extended bits are known zero */);
+ Known = Known.zext(DstSz);
 }
 assert(DstSz == Known.getBitWidth());
 break;
 }
+ case ARMISD::VMOVrh: {
+ KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ assert(KnownOp.getBitWidth() == 16);
+ Known = KnownOp.zext(32);
+ break;
+ }
 }
}
-bool
-ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &DemandedAPInt,
- TargetLoweringOpt &TLO) const {
+bool ARMTargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
 // Delay optimization, so we don't have to deal with illegal types, or block
 // optimizations.
 if (!TLO.LegalOps)
@@ -15770,7 +17157,7 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
 unsigned Mask = C->getZExtValue();
- unsigned Demanded = DemandedAPInt.getZExtValue();
+ unsigned Demanded = DemandedBits.getZExtValue();
 unsigned ShrunkMask = Mask & Demanded;
 unsigned ExpandedMask = Mask | ~Demanded;
@@ -15825,6 +17212,35 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
 return false;
}
+bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+
+ switch (Opc) {
+ case ARMISD::ASRL:
+ case ARMISD::LSRL: {
+ // If this is result 0 and the other result is unused, see if the demanded
+ // bits allow us to shrink this long shift into a standard small shift in
+ // the opposite direction.
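(Editorial sketch, not part of the patch.) A worked instance of the shrink described in the comment above, assuming the operand order the code relies on, with operand 0 the low word, operand 1 the high word, and operand 2 the shift amount; the values are chosen purely for illustration:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Hi = 0x89ABCDEF, Lo = 0x12345678;
  unsigned ShAmt = 8;
  // Result 0 of a long shift right (LSRL) by ShAmt:
  uint32_t Res0 = (Lo >> ShAmt) | (Hi << (32 - ShAmt)); // 0xEF123456
  // Demanded bits form a subset of getAllOnesValue(32) << (32 - ShAmt):
  uint32_t Demanded = 0xFF000000u;
  // Those bits come entirely from the high word, so the whole long shift
  // can be rebuilt as SHL(Hi, 32 - ShAmt):
  assert((Res0 & Demanded) == ((Hi << (32 - ShAmt)) & Demanded));
}

The CombineTo in the code that follows performs exactly this rewrite to a plain ISD::SHL of operand 1.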
+ if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && + isa<ConstantSDNode>(Op->getOperand(2))) { + unsigned ShAmt = Op->getConstantOperandVal(2); + if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf( + APInt::getAllOnesValue(32) << (32 - ShAmt))) + return TLO.CombineTo( + Op, TLO.DAG.getNode( + ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), + TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); + } + break; + } + } + + return TargetLowering::SimplifyDemandedBitsForTargetNode( + Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); +} //===----------------------------------------------------------------------===// // ARM Inline Assembly Support @@ -15835,7 +17251,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { if (!Subtarget->hasV6Ops()) return false; - InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); std::string AsmStr = IA->getAsmString(); SmallVector<StringRef, 4> AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); @@ -15843,7 +17259,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { switch (AsmPieces.size()) { default: return false; case 1: - AsmStr = AsmPieces[0]; + AsmStr = std::string(AsmPieces[0]); AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t,"); @@ -16342,13 +17758,15 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + MaybeAlign Align = + cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) - SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), - DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); + SP = + DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), + DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); @@ -16552,7 +17970,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); + Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -16593,7 +18011,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); - Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue()); + Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -16619,6 +18037,34 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::arm_mve_vld2q: + case Intrinsic::arm_mve_vld4q: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + Type *VecTy = cast<StructType>(I.getType())->getElementType(1); + unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 
2 : 4; + Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Align(VecTy->getScalarSizeInBits() / 8); + // volatile loads with MVE intrinsics not supported + Info.flags = MachineMemOperand::MOLoad; + return true; + } + case Intrinsic::arm_mve_vst2q: + case Intrinsic::arm_mve_vst4q: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + Type *VecTy = I.getArgOperand(1)->getType(); + unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4; + Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Align(VecTy->getScalarSizeInBits() / 8); + // volatile stores with MVE intrinsics not supported + Info.flags = MachineMemOperand::MOStore; + return true; + } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); @@ -16627,7 +18073,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -16639,7 +18085,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.align = DL.getABITypeAlign(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -16873,7 +18319,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); - unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth(); + unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize(); // We can do a store + vector extract on any vector that fits perfectly in a D // or Q register. if (BitWidth == 64 || BitWidth == 128) { @@ -16986,7 +18432,7 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, } bool ARMTargetLowering::isLegalInterleavedAccessType( - unsigned Factor, VectorType *VecTy, const DataLayout &DL) const { + unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); @@ -17045,8 +18491,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - VectorType *VecTy = Shuffles[0]->getType(); - Type *EltTy = VecTy->getVectorElementType(); + auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); + Type *EltTy = VecTy->getElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); @@ -17061,8 +18507,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. 
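(Editorial sketch, not part of the patch.) The comment above is acted on just below: when the element type is a pointer, the vldN is issued on the DataLayout's integer pointer type and the de-interleaved results are IntToPtr-cast back afterwards. A standalone form of the type substitution, using the same FixedVectorType::get overload the patch itself calls (intVecForPtrVec is a hypothetical helper named only for illustration):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include <cassert>
using namespace llvm;

static FixedVectorType *intVecForPtrVec(FixedVectorType *VecTy,
                                        const DataLayout &DL) {
  Type *EltTy = VecTy->getElementType();
  assert(EltTy->isPointerTy() && "only needed for pointer vectors");
  // e.g. <4 x i8*> becomes <4 x i32> on 32-bit ARM; the loaded parts are
  // converted back to pointer vectors after the vldN intrinsic call.
  return FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
}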
if (EltTy->isPointerTy()) - VecTy = - VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); + VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); IRBuilder<> Builder(LI); @@ -17072,15 +18517,15 @@ bool ARMTargetLowering::lowerInterleavedLoad( if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. - VecTy = VectorType::get(VecTy->getVectorElementType(), - VecTy->getVectorNumElements() / NumLoads); + VecTy = FixedVectorType::get(VecTy->getElementType(), + VecTy->getNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( - BaseAddr, VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace())); + BaseAddr, + VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); @@ -17105,8 +18550,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( "expected interleave factor of 2 or 4 for MVE"); Intrinsic::ID LoadInts = Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; - Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo( - LI->getPointerAddressSpace()); + Type *VecEltTy = + VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, VecEltTy}; Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); @@ -17126,9 +18571,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = - Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, - VecTy->getVectorNumElements() * Factor); + BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, + VecTy->getNumElements() * Factor); CallInst *VldN = createLoadIntrinsic(BaseAddr); @@ -17143,8 +18587,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( - SubVec, VectorType::get(SV->getType()->getVectorElementType(), - VecTy->getVectorNumElements())); + SubVec, + FixedVectorType::get(SV->getType()->getElementType(), VecTy)); SubVecs[SV].push_back(SubVec); } @@ -17196,13 +18640,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - VectorType *VecTy = SVI->getType(); - assert(VecTy->getVectorNumElements() % Factor == 0 && - "Invalid interleaved store"); + auto *VecTy = cast<FixedVectorType>(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - unsigned LaneLen = VecTy->getVectorNumElements() / Factor; - Type *EltTy = VecTy->getVectorElementType(); - VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -17224,12 +18667,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. 
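(Editorial sketch, not part of the patch.) The same splitting scheme appears in the load path above and the store path that continues below: when the requested type is illegal, the access is broken into NumLoads legal pieces, and each subsequent piece starts NumElements * Factor scalar elements past the previous one, which is exactly the CreateConstGEP1_32 offset. A self-contained check of that arithmetic with assumed example values (a factor-2 load whose components are <8 x i32>, split into <4 x i32> halves):

#include <cassert>

int main() {
  unsigned Factor = 2;        // interleave factor
  unsigned ComponentElts = 8; // elements per de-interleaved component
  unsigned NumLoads = 2;      // splits needed to reach a legal 128-bit type
  unsigned SubElts = ComponentElts / NumLoads; // 4: the reset VecTy width
  unsigned Stride = SubElts * Factor;          // 8 scalars per vldN
  // The strided vldNs tile the interleaved region exactly once:
  assert(Stride * NumLoads == ComponentElts * Factor);
}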
- Type *IntVecTy =
- VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+ auto *IntVecTy =
+ FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, LaneLen);
+ SubVecTy = FixedVectorType::get(IntTy, LaneLen);
 }
 // The base address of the store.
@@ -17239,14 +18682,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
 // If we're going to generate more than one store, reset the lane length
 // and sub-vector type to something legal.
 LaneLen /= NumStores;
- SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
 // We will compute the pointer operand of each store from the original base
 // address using GEPs. Cast the base address to a pointer to the scalar
 // element type.
 BaseAddr = Builder.CreateBitCast(
- BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
- SI->getPointerAddressSpace()));
+ BaseAddr,
+ SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
 }
 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
@@ -17276,7 +18719,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
 "expected interleave factor of 2 or 4 for MVE");
 Intrinsic::ID StoreInts = Factor == 2 ? Intrinsic::arm_mve_vst2q
 : Intrinsic::arm_mve_vst4q;
- Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+ Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
 SI->getPointerAddressSpace());
 Type *Tys[] = {EltPtrTy, SubVecTy};
 Function *VstNFunc =
@@ -17298,7 +18741,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
 // If we're generating more than one store, we compute the base address of
 // subsequent stores as an offset from the previous.
 if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
 BaseAddr, LaneLen * Factor);
 SmallVector<Value *, 4> Shuffles;
@@ -17308,7 +18751,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
 unsigned IdxI = StoreCount * LaneLen * Factor + i;
 if (Mask[IdxI] >= 0) {
 Shuffles.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
 } else {
 unsigned StartMask = 0;
 for (unsigned j = 1; j < LaneLen; j++) {
@@ -17325,7 +18768,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
 // Note: StartMask cannot be negative, it's checked in
 // isReInterleaveMask
 Shuffles.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
 }
 }
@@ -17373,11 +18816,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
 case HA_DOUBLE: return false;
 case HA_VECT64:
- return VT->getBitWidth() == 64;
+ return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
 case HA_VECT128:
- return VT->getBitWidth() == 128;
+ return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
 case HA_UNKNOWN:
- switch (VT->getBitWidth()) {
+ switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
 case 64: Base = HA_VECT64; return true;
@@ -17396,7 +18839,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
 /// Return the correct alignment for the current calling convention.
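(Editorial sketch, not part of the patch.) The store-lowering hunks above carve each vstN operand out of the wide shufflevector with createSequentialMask, whose post-patch form, as this diff shows, no longer takes an IRBuilder and simply materializes a run of indices. A usage sketch under that assumption:

#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

void demo() {
  // {4, 5, 6, 7}: the second de-interleaved component when LaneLen is 4
  // in a factor-2 interleaved store.
  llvm::SmallVector<int, 16> Mask =
      llvm::createSequentialMask(/*Start=*/4, /*NumInts=*/4, /*NumUndefs=*/0);
  assert(Mask.size() == 4 && Mask[0] == 4 && Mask[3] == 7);
}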
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const { - const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy)); + const Align ABITypeAlign = DL.getABITypeAlign(ArgTy); if (!ArgTy->isVectorTy()) return ABITypeAlign; @@ -17423,18 +18866,18 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( return IsHA || IsIntArray; } -unsigned ARMTargetLowering::getExceptionPointerRegister( +Register ARMTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; + return Subtarget->useSjLjEH() ? Register() : ARM::R0; } -unsigned ARMTargetLowering::getExceptionSelectorRegister( +Register ARMTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. - return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; + return Subtarget->useSjLjEH() ? Register() : ARM::R1; } void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h index 6061a65d3b89..8b1f4183032e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.h @@ -68,10 +68,12 @@ class VectorType; CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. + tSECALL, // CMSE non-secure function call. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). RET_FLAG, // Return with a flag operand. + SERET_FLAG, // CMSE Entry function return with a flag operand. INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. PIC_ADD, // Add with a PC operand and a PIC label. @@ -133,6 +135,7 @@ class VectorType; LE, // Low-overhead loops, Loop End PREDICATE_CAST, // Predicate cast for MVE i1 types + VECTOR_REG_CAST, // Reinterpret the current contents of a vector register VCMP, // Vector compare. VCMPZ, // Vector compare to zero. 
@@ -201,10 +204,36 @@ class VectorType; VTBL2, // 2-register shuffle with mask VMOVN, // MVE vmovn + // MVE Saturating truncates + VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s) + VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u) + + // MVE float <> half converts + VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes + VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes + // Vector multiply long: VMULLs, // ...signed VMULLu, // ...unsigned + // MVE reductions + VADDVs, // sign- or zero-extend the elements of a vector to i32, + VADDVu, // add them all together, and return an i32 of their sum + VADDLVs, // sign- or zero-extend elements to i64 and sum, returning + VADDLVu, // the low and high 32-bit halves of the sum + VADDLVAs, // same as VADDLV[su] but also add an input accumulator + VADDLVAu, // provided as low and high halves + VADDLVps, // same as VADDLVs but with a v4i1 predicate mask + VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask + VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask + VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask + VMLAVs, + VMLAVu, + VMLALVs, + VMLALVu, + VMLALVAs, + VMLALVAu, + SMULWB, // Signed multiply word by half word, bottom SMULWT, // Signed multiply word by half word, top UMLAL, // 64bit Unsigned Accumulate Multiply @@ -280,7 +309,11 @@ class VectorType; VST4_UPD, VST2LN_UPD, VST3LN_UPD, - VST4LN_UPD + VST4LN_UPD, + + // Load/Store of dual registers + LDRD, + STRD }; } // end namespace ARMISD @@ -333,8 +366,16 @@ class VectorType; SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + bool SimplifyDemandedBitsForTargetNode(SDValue Op, + const APInt &OriginalDemandedBits, + const APInt &OriginalDemandedElts, + KnownBits &Known, + TargetLoweringOpt &TLO, + unsigned Depth) const override; + bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override; /// allowsMisalignedMemoryAccesses - Returns true if the target allows @@ -345,10 +386,7 @@ class VectorType; MachineMemOperand::Flags Flags, bool *Fast) const override; - EVT getOptimalMemOpType(uint64_t Size, - unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, + EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; @@ -356,6 +394,7 @@ class VectorType; bool isZExtFree(SDValue Val, EVT VT2) const override; bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; + Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const override; bool isFNegFree(EVT VT) const override; @@ -414,10 +453,10 @@ class VectorType; const SelectionDAG &DAG, unsigned Depth) const override; - bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, TargetLoweringOpt &TLO) const override; - bool ExpandInlineAsm(CallInst *CI) const override; ConstraintType getConstraintType(StringRef Constraint) const override; @@ -522,6 +561,12 @@ class VectorType; bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const 
override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ // Using overflow ops for overflow checks only should be beneficial on ARM.
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
+ }
+
 /// Returns true if an argument of type Ty needs to be passed in a
 /// contiguous block of registers in calling convention CallConv.
 bool functionArgumentNeedsConsecutiveRegisters(
@@ -529,12 +574,12 @@ class VectorType;
 /// If a physical register, this returns the register that receives the
 /// exception address on entry to an EH pad.
- unsigned
+ Register
 getExceptionPointerRegister(const Constant *PersonalityFn) const override;
 /// If a physical register, this returns the register that receives the
 /// exception typeid on entry to a landing pad.
- unsigned
+ Register
 getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
 Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
@@ -606,7 +651,7 @@ class VectorType;
 /// Returns true if \p VecTy is a legal interleaved access type. This
 /// function checks the vector element type and the overall width of the
 /// vector.
- bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy,
+ bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy,
 const DataLayout &DL) const;
 bool alignLoopsWithOptSize() const override;
@@ -723,6 +768,8 @@ class VectorType;
 SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
 void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
 SmallVectorImpl<SDValue> &Results) const;
+ SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const;
 SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
 SDValue &Chain) const;
 SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
@@ -734,6 +781,8 @@ class VectorType;
 SDValue LowerFSETCC(SDValue Op, SelectionDAG &DAG) const;
 void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
 SelectionDAG &DAG) const;
+ void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
 Register getRegisterByName(const char* RegName, LLT VT,
 const MachineFunction &MF) const override;
@@ -744,6 +793,11 @@ class VectorType;
 bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
 EVT VT) const override;
+ SDValue MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT, MVT ValVT,
+ SDValue Val) const;
+ SDValue MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT,
+ MVT ValVT, SDValue Val) const;
+
 SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -763,6 +817,17 @@ class VectorType;
 MachineBasicBlock *Entry,
 const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ bool
+ splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ Optional<CallingConv::ID> CC) const override;
+
+ SDValue
+ joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT,
+ Optional<CallingConv::ID> CC) const override;
+
 SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
 bool isVarArg,
 const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -783,7 +848,7 @@ class VectorType;
 SmallVectorImpl<SDValue> &InVals) const override;
 /// HandleByVal - Target-specific cleanup for ByVal support.
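(Editorial sketch, not part of the patch.) The shouldFormOverflowOp override earlier in this header hunk deliberately passes MathUsed = true to the generic hook, so ARM also forms overflow nodes when only the overflow flag is consumed. An illustrative source pattern that benefits:

#include <cstdint>

// Only the carry of the add is used; with the override above this is still
// a candidate for a single ISD::UADDO instead of an add plus a compare.
bool addWouldOverflow(uint32_t a, uint32_t b) {
  return a + b < a;
}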
- void HandleByVal(CCState *, unsigned &, unsigned) const override; + void HandleByVal(CCState *, unsigned &, Align) const override; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td new file mode 100644 index 000000000000..0e97668e2e01 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -0,0 +1,666 @@ +//===-- ARMInstrCDE.td - CDE support for ARM ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the Arm CDE (Custom Datapath Extension) instruction set. +// +//===----------------------------------------------------------------------===// + +// Immediate operand of arbitrary bit width +class BitWidthImmOperand<int width> + : ImmAsmOperand<0, !add(!shl(1, width), -1)> { + let Name = "Imm"#width#"b"; +} + +class BitWidthImm<int width> + : Operand<i32>, + ImmLeaf<i32, "{ return Imm >= 0 && Imm < (1 << "#width#"); }"> { + let ParserMatchClass = BitWidthImmOperand<width>; +} + +def CDEDualRegOp : RegisterOperand<GPRPairnosp, "printGPRPairOperand">; + +// Used by VCX3 FP +def imm_3b : BitWidthImm<3>; + +// Used by VCX3 vector +def imm_4b : BitWidthImm<4>; + +// Used by VCX2 FP and CX3 +def imm_6b : BitWidthImm<6>; + +// Used by VCX2 vector +def imm_7b : BitWidthImm<7>; + +// Used by CX2 +def imm_9b : BitWidthImm<9>; + +// Used by VCX1 FP +def imm_11b : BitWidthImm<11>; + +// Used by VCX1 vector +def imm_12b : BitWidthImm<12>; + +// Used by CX1 +def imm_13b : BitWidthImm<13>; + +// Base class for all CDE instructions +class CDE_Instr<bit acc, dag oops, dag iops, string asm, string cstr> + : Thumb2XI<oops, !con((ins p_imm:$coproc), iops), + AddrModeNone, /*sz=*/4, NoItinerary, + asm, cstr, /*pattern=*/[]>, + Sched<[]> { + bits<3> coproc; + + let Inst{31-29} = 0b111; // 15:13 + let Inst{28} = acc; + let Inst{27-26} = 0b11; + let Inst{11} = 0b0; + let Inst{10-8} = coproc{2-0}; + + let isPredicable = 0; + let DecoderNamespace = "Thumb2CDE"; +} + +// Base class for CX* CDE instructions +class CDE_GPR_Instr<bit dual, bit acc, dag oops, dag iops, + string asm, string cstr> + : CDE_Instr<acc, oops, iops, asm, cstr>, + Requires<[HasCDE]> { + + let Inst{25-24} = 0b10; + let Inst{6} = dual; + let isPredicable = acc; +} + +// Set of registers used by the CDE instructions. 
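(Editorial sketch, not part of the patch.) Each BitWidthImm<width> defined above expands to an ImmLeaf whose predicate accepts exactly the unsigned values encodable in width bits. Written out in C++ for the CX1 immediate, imm_13b:

// Equivalent of the ImmLeaf predicate "Imm >= 0 && Imm < (1 << 13)"
// instantiated by BitWidthImm<13>: accepts 0 .. 8191.
bool isImm13b(long long Imm) { return Imm >= 0 && Imm < (1 << 13); }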
+class CDE_RegisterOperands {
+ dag Rd;
+ dag Rd_src;
+ dag Rn;
+ dag Rm;
+}
+
+// CX* CDE instruction parameter set
+class CX_Params {
+ dag Oops; // Output operands for CX* instructions
+ dag Iops1; // Input operands for CX1* instructions
+ dag Iops2; // Input operands for CX2* instructions
+ dag Iops3; // Input operands for CX3* instructions
+ dag PredOp; // Input predicate operand
+ string PAsm; // Predicate assembly string
+ string Cstr; // asm constraint string
+ bit Dual; // "dual" field for encoding
+ bit Acc; // "acc" field for encoding
+}
+
+// VCX* CDE instruction parameter set
+class VCX_Params {
+ dag Oops; // Output operands for VCX* instructions
+ dag Iops1; // Input operands for VCX1* instructions
+ dag Iops2; // Input operands for VCX2* instructions
+ dag Iops3; // Input operands for VCX3* instructions
+ string Cstr; // asm constraint string
+ bit Acc; // "acc" field for encoding
+ vpred_ops Vpred; // Predication type for VCX* vector instructions
+}
+
+// CX1, CX1A, CX1D, CX1DA
+class CDE_CX1_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops1, (ins imm_13b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $imm"),
+ params.Cstr> {
+ bits<13> imm;
+ bits<4> Rd;
+
+ let Inst{23-22} = 0b00;
+ let Inst{21-16} = imm{12-7};
+ let Inst{15-12} = Rd{3-0};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// CX2, CX2A, CX2D, CX2DA
+class CDE_CX2_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops2, (ins imm_9b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $imm"),
+ params.Cstr> {
+ bits<9> imm;
+ bits<4> Rd;
+ bits<4> Rn;
+
+ let Inst{23-22} = 0b01;
+ let Inst{21-20} = imm{8-7};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = Rd{3-0};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// CX3, CX3A, CX3D, CX3DA
+class CDE_CX3_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops3, (ins imm_6b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $Rm, $imm"),
+ params.Cstr> {
+ bits<6> imm;
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{23} = 0b1;
+ let Inst{22-20} = imm{5-3};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = Rm{3-0};
+ let Inst{7} = imm{2};
+ let Inst{5-4} = imm{1-0};
+ let Inst{3-0} = Rd{3-0};
+}
+
+// Registers for single-register variants of CX* instructions
+def cde_cx_single_regs : CDE_RegisterOperands {
+ let Rd = (outs GPRwithAPSR_NZCVnosp:$Rd);
+ let Rd_src = (ins GPRwithAPSR_NZCVnosp:$Rd_src);
+ let Rn = (ins GPRwithAPSR_NZCVnosp:$Rn);
+ let Rm = (ins GPRwithAPSR_NZCVnosp:$Rm);
+}
+
+// Registers for dual-register variants of CX* instructions
+def cde_cx_dual_regs : CDE_RegisterOperands {
+ let Rd = (outs CDEDualRegOp:$Rd);
+ let Rd_src = (ins CDEDualRegOp:$Rd_src);
+ let Rn = (ins GPRwithAPSR_NZCVnosp:$Rn);
+ let Rm = (ins GPRwithAPSR_NZCVnosp:$Rm);
+}
+
+class CDE_CX_ParamsTemplate<bit dual, bit acc, CDE_RegisterOperands ops>
+ : CX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rn);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let PredOp = !if(acc, (ins pred:$p), (ins));
+ let PAsm = !if(acc, "${p}", "");
+ let Cstr = !if(acc, "$Rd = $Rd_src", "");
+ let Dual = dual;
+ let Acc = acc;
+}
+
+def cde_cx_params_single_noacc : CDE_CX_ParamsTemplate<0b0, 0b0, cde_cx_single_regs>;
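(Editorial sketch, not part of the patch.) The accumulator ("A") parameter sets built from this template read and write the same register, which is what the "$Rd = $Rd_src" constraint above expresses. At the source level these instructions are normally reached through ACLE intrinsics; the sketch below assumes clang's <arm_cde.h> spellings and a target with the cdecp0 feature, and is illustrative rather than taken from this patch:

#include <arm_cde.h>
#include <stdint.h>

uint32_t cde_demo(uint32_t acc) {
  acc ^= __arm_cx1(0, 1234);       // CX1: immediate-only form
  return __arm_cx1a(0, acc, 1234); // CX1A: accumulator tied, $Rd = $Rd_src
}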
+def cde_cx_params_single_acc : CDE_CX_ParamsTemplate<0b0, 0b1, cde_cx_single_regs>; +def cde_cx_params_dual_noacc : CDE_CX_ParamsTemplate<0b1, 0b0, cde_cx_dual_regs>; +def cde_cx_params_dual_acc : CDE_CX_ParamsTemplate<0b1, 0b1, cde_cx_dual_regs>; + +def CDE_CX1 : CDE_CX1_Instr<"cx1", cde_cx_params_single_noacc>; +def CDE_CX1A : CDE_CX1_Instr<"cx1a", cde_cx_params_single_acc>; +def CDE_CX1D : CDE_CX1_Instr<"cx1d", cde_cx_params_dual_noacc>; +def CDE_CX1DA : CDE_CX1_Instr<"cx1da", cde_cx_params_dual_acc>; + +def CDE_CX2 : CDE_CX2_Instr<"cx2", cde_cx_params_single_noacc>; +def CDE_CX2A : CDE_CX2_Instr<"cx2a", cde_cx_params_single_acc>; +def CDE_CX2D : CDE_CX2_Instr<"cx2d", cde_cx_params_dual_noacc>; +def CDE_CX2DA : CDE_CX2_Instr<"cx2da", cde_cx_params_dual_acc>; + +def CDE_CX3 : CDE_CX3_Instr<"cx3", cde_cx_params_single_noacc>; +def CDE_CX3A : CDE_CX3_Instr<"cx3a", cde_cx_params_single_acc>; +def CDE_CX3D : CDE_CX3_Instr<"cx3d", cde_cx_params_dual_noacc>; +def CDE_CX3DA : CDE_CX3_Instr<"cx3da", cde_cx_params_dual_acc>; + +let Predicates = [HasCDE] in { + def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)), + (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + timm:$imm)), + (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + imm_13b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n, + timm:$imm)), + (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n, + imm_9b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, timm:$imm)), + (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, timm:$imm)), + (i32 (CDE_CX3 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc, + GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, timm:$imm)), + (i32 (CDE_CX3A p_imm:$coproc, + GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>; +} + +class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>; +class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>; +class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>; + +// Base class for CDE VCX* instructions +class CDE_FP_Vec_Instr<bit vec, bit acc, dag oops, dag iops, string asm, string cstr> + : CDE_Instr<acc, oops, iops, asm, cstr> { + let Inst{25} = 0b0; + let Inst{6} = vec; +} + +// Base class for floating-point variants of CDE VCX* instructions +class CDE_FP_Instr<bit acc, bit sz, dag oops, dag iops, string asm, string cstr> + : CDE_FP_Vec_Instr<0b0, acc, oops, iops, asm, cstr> { + let Inst{24} = sz; +} + +// Base class for vector variants of CDE VCX* instruction +class CDE_Vec_Instr<bit acc, dag oops, dag iops, string asm, string cstr, + vpred_ops vpred> + : CDE_FP_Vec_Instr<0b1, acc, oops, + !con(iops, (ins vpred:$vp)), asm, + !strconcat(cstr, vpred.vpred_constraint)>, + CDE_RequiresQReg { +} + + +// VCX1/VCX1A, vector variant +class CDE_VCX1_Vec_Instr<string iname, VCX_Params params> + : CDE_Vec_Instr<params.Acc, params.Oops, + !con(params.Iops1, (ins imm_12b:$imm)), + iname#"${vp}\t$coproc, $Qd, $imm", params.Cstr, params.Vpred> { + bits<12> imm; + bits<3> Qd; + + let Inst{24} = imm{11}; + let Inst{23} = 0b0; + let Inst{22} = 0b0; + let Inst{21-20} = 0b10; + let Inst{19-16} = imm{10-7}; + let 
Inst{15-13} = Qd{2-0}; + let Inst{12} = 0b0; + let Inst{7} = imm{6}; + let Inst{5-0} = imm{5-0}; + + let Unpredictable{22} = 0b1; +} + +// VCX1/VCX1A, base class for FP variants +class CDE_VCX1_FP_Instr<bit sz, string iname, VCX_Params params> + : CDE_FP_Instr<params.Acc, sz, params.Oops, + !con(params.Iops1, (ins imm_11b:$imm)), + iname#"\t$coproc, $Vd, $imm", params.Cstr> { + bits<11> imm; + + let Inst{23} = 0b0; + let Inst{21-20} = 0b10; + let Inst{19-16} = imm{10-7}; + let Inst{7} = imm{6}; + let Inst{5-0} = imm{5-0}; +} + +// VCX1/VCX1A, S registers +class CDE_VCX1_FP_Instr_S<string iname, VCX_Params params> + : CDE_VCX1_FP_Instr<0b0, iname, params>, + CDE_RequiresSReg { + bits<5> Vd; + + let Inst{22} = Vd{0}; + let Inst{15-12} = Vd{4-1}; +} + +// VCX1/VCX1A, D registers +class CDE_VCX1_FP_Instr_D<string iname, VCX_Params params> + : CDE_VCX1_FP_Instr<0b1, iname, params>, + CDE_RequiresDReg { + bits<5> Vd; + + let Inst{22} = Vd{4}; + let Inst{15-12} = Vd{3-0}; +} + +// VCX2/VCX2A, vector variant +class CDE_VCX2_Vec_Instr<string iname, VCX_Params params> + : CDE_Vec_Instr<params.Acc, params.Oops, + !con(params.Iops2, (ins imm_7b:$imm)), + iname#"${vp}\t$coproc, $Qd, $Qm, $imm", params.Cstr, + params.Vpred> { + bits<7> imm; + bits<3> Qd; + bits<3> Qm; + + let Inst{24} = imm{6}; + let Inst{23} = 0b0; + let Inst{22} = 0b0; + let Inst{21-20} = 0b11; + let Inst{19-16} = imm{5-2}; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = 0b0; + let Inst{7} = imm{1}; + let Inst{5} = 0b0; + let Inst{4} = imm{0}; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; + + let Unpredictable{22} = 0b1; + let Unpredictable{5} = 0b1; +} + +// VCX2/VCX2A, base class for FP variants +class CDE_VCX2_FP_Instr<bit sz, string iname, VCX_Params params> + : CDE_FP_Instr<params.Acc, sz, params.Oops, + !con(params.Iops2, (ins imm_6b:$imm)), + iname#"\t$coproc, $Vd, $Vm, $imm", params.Cstr> { + bits<6> imm; + + let Inst{23} = 0b0; + let Inst{21-20} = 0b11; + let Inst{19-16} = imm{5-2}; + let Inst{7} = imm{1}; + let Inst{4} = imm{0}; +} + +// VCX2/VCX2A, S registers +class CDE_VCX2_FP_Instr_S<string iname, VCX_Params params> + : CDE_VCX2_FP_Instr<0b0, iname, params>, + CDE_RequiresSReg { + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{4-1}; + let Inst{22} = Vd{0}; + let Inst{3-0} = Vm{4-1}; + let Inst{5} = Vm{0}; +} + +// VCX2/VCX2A, D registers +class CDE_VCX2_FP_Instr_D<string iname, VCX_Params params> + : CDE_VCX2_FP_Instr<0b1, iname, params>, + CDE_RequiresDReg { + bits<5> Vd; + bits<5> Vm; + + let Inst{15-12} = Vd{3-0}; + let Inst{22} = Vd{4}; + let Inst{3-0} = Vm{3-0}; + let Inst{5} = Vm{4}; +} + +// VCX3/VCX3A, vector variant +class CDE_VCX3_Vec_Instr<string iname, VCX_Params params> + : CDE_Vec_Instr<params.Acc, params.Oops, + !con(params.Iops3, (ins imm_4b:$imm)), + iname#"${vp}\t$coproc, $Qd, $Qn, $Qm, $imm", params.Cstr, + params.Vpred> { + bits<4> imm; + bits<3> Qd; + bits<3> Qm; + bits<3> Qn; + + let Inst{24} = imm{3}; + let Inst{23} = 0b1; + let Inst{22} = 0b0; + let Inst{21-20} = imm{2-1}; + let Inst{19-17} = Qn{2-0}; + let Inst{16} = 0b0; + let Inst{15-13} = Qd{2-0}; + let Inst{12} = 0b0; + let Inst{7} = 0b0; + let Inst{5} = 0b0; + let Inst{4} = imm{0}; + let Inst{3-1} = Qm{2-0}; + let Inst{0} = 0b0; + + let Unpredictable{22} = 0b1; + let Unpredictable{7} = 0b1; + let Unpredictable{5} = 0b1; +} + +// VCX3/VCX3A, base class for FP variants +class CDE_VCX3_FP_Instr<bit sz, string iname, VCX_Params params> + : CDE_FP_Instr<params.Acc, sz, params.Oops, + !con(params.Iops3, (ins imm_3b:$imm)), + iname#"\t$coproc, 
$Vd, $Vn, $Vm, $imm", params.Cstr> { + bits<3> imm; + + let Inst{23} = 0b1; + let Inst{21-20} = imm{2-1}; + let Inst{4} = imm{0}; +} + +// VCX3/VCX3A, S registers +class CDE_VCX3_FP_Instr_S<string iname, VCX_Params params> + : CDE_VCX3_FP_Instr<0b0, iname, params>, + CDE_RequiresSReg { + bits<5> Vd; + bits<5> Vm; + bits<5> Vn; + + let Inst{22} = Vd{0}; + let Inst{19-16} = Vn{4-1}; + let Inst{15-12} = Vd{4-1}; + let Inst{7} = Vn{0}; + let Inst{5} = Vm{0}; + let Inst{3-0} = Vm{4-1}; +} + +// VCX3/VCX3A, D registers +class CDE_VCX3_FP_Instr_D<string iname, VCX_Params params> + : CDE_VCX3_FP_Instr<0b1, iname, params>, + CDE_RequiresDReg { + bits<5> Vd; + bits<5> Vm; + bits<5> Vn; + + let Inst{22} = Vd{4}; + let Inst{19-16} = Vn{3-0}; + let Inst{15-12} = Vd{3-0}; + let Inst{7} = Vn{4}; + let Inst{5} = Vm{4}; + let Inst{3-0} = Vm{3-0}; +} + +// Register operands for VCX* instructions +class CDE_VCX_RegisterOperandsTemplate<RegisterClass regclass> + : CDE_RegisterOperands { + let Rd = (outs regclass:$Vd); + let Rd_src = (ins regclass:$Vd_src); + let Rn = (ins regclass:$Vn); + let Rm = (ins regclass:$Vm); +} + +class CDE_VCXQ_RegisterOperandsTemplate<RegisterClass regclass> + : CDE_RegisterOperands { + let Rd = (outs regclass:$Qd); + let Rd_src = (ins regclass:$Qd_src); + let Rn = (ins regclass:$Qn); + let Rm = (ins regclass:$Qm); +} + +def cde_vcx_s_regs : CDE_VCX_RegisterOperandsTemplate<SPR>; +def cde_vcx_d_regs : CDE_VCX_RegisterOperandsTemplate<DPR_VFP2>; +def cde_vcx_q_regs : CDE_VCXQ_RegisterOperandsTemplate<MQPR>; + +class CDE_VCX_ParamsTemplate<bit acc, CDE_RegisterOperands ops> + : VCX_Params { + + dag IOpsPrefix = !if(acc, ops.Rd_src, (ins)); + + let Oops = ops.Rd; + let Iops1 = IOpsPrefix; + let Iops2 = !con(IOpsPrefix, ops.Rm); + let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm); + let Cstr = !if(acc, "$Vd = $Vd_src", ""); + let Acc = acc; +} + +class CDE_VCXQ_ParamsTemplate<bit acc, CDE_RegisterOperands ops> + : VCX_Params { + + dag IOpsPrefix = !if(acc, ops.Rd_src, (ins)); + + let Oops = ops.Rd; + let Iops1 = IOpsPrefix; + let Iops2 = !con(IOpsPrefix, ops.Rm); + let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm); + let Cstr = !if(acc, "$Qd = $Qd_src", ""); + let Acc = acc; + let Vpred = !if(acc, vpred_n, vpred_r); +} + +def cde_vcx_params_s_noacc : CDE_VCX_ParamsTemplate<0b0, cde_vcx_s_regs>; +def cde_vcx_params_s_acc : CDE_VCX_ParamsTemplate<0b1, cde_vcx_s_regs>; +def cde_vcx_params_d_noacc : CDE_VCX_ParamsTemplate<0b0, cde_vcx_d_regs>; +def cde_vcx_params_d_acc : CDE_VCX_ParamsTemplate<0b1, cde_vcx_d_regs>; +def cde_vcx_params_q_noacc : CDE_VCXQ_ParamsTemplate<0b0, cde_vcx_q_regs>; +def cde_vcx_params_q_acc : CDE_VCXQ_ParamsTemplate<0b1, cde_vcx_q_regs>; + +def CDE_VCX1_fpsp : CDE_VCX1_FP_Instr_S<"vcx1", cde_vcx_params_s_noacc>; +def CDE_VCX1A_fpsp : CDE_VCX1_FP_Instr_S<"vcx1a", cde_vcx_params_s_acc>; +def CDE_VCX1_fpdp : CDE_VCX1_FP_Instr_D<"vcx1", cde_vcx_params_d_noacc>; +def CDE_VCX1A_fpdp : CDE_VCX1_FP_Instr_D<"vcx1a", cde_vcx_params_d_acc>; +def CDE_VCX1_vec : CDE_VCX1_Vec_Instr<"vcx1", cde_vcx_params_q_noacc>; +def CDE_VCX1A_vec : CDE_VCX1_Vec_Instr<"vcx1a", cde_vcx_params_q_acc>; + +def CDE_VCX2_fpsp : CDE_VCX2_FP_Instr_S<"vcx2", cde_vcx_params_s_noacc>; +def CDE_VCX2A_fpsp : CDE_VCX2_FP_Instr_S<"vcx2a", cde_vcx_params_s_acc>; +def CDE_VCX2_fpdp : CDE_VCX2_FP_Instr_D<"vcx2", cde_vcx_params_d_noacc>; +def CDE_VCX2A_fpdp : CDE_VCX2_FP_Instr_D<"vcx2a", cde_vcx_params_d_acc>; +def CDE_VCX2_vec : CDE_VCX2_Vec_Instr<"vcx2", cde_vcx_params_q_noacc>; +def CDE_VCX2A_vec : 
CDE_VCX2_Vec_Instr<"vcx2a", cde_vcx_params_q_acc>; + +def CDE_VCX3_fpsp : CDE_VCX3_FP_Instr_S<"vcx3", cde_vcx_params_s_noacc>; +def CDE_VCX3A_fpsp : CDE_VCX3_FP_Instr_S<"vcx3a", cde_vcx_params_s_acc>; +def CDE_VCX3_fpdp : CDE_VCX3_FP_Instr_D<"vcx3", cde_vcx_params_d_noacc>; +def CDE_VCX3A_fpdp : CDE_VCX3_FP_Instr_D<"vcx3a", cde_vcx_params_d_acc>; +def CDE_VCX3_vec : CDE_VCX3_Vec_Instr<"vcx3", cde_vcx_params_q_noacc>; +def CDE_VCX3A_vec : CDE_VCX3_Vec_Instr<"vcx3a", cde_vcx_params_q_acc>; + + +let Predicates = [HasCDE, HasFPRegs] in { + def : Pat<(f32 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)), + (f32 (CDE_VCX1_fpsp p_imm:$coproc, imm_11b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx1a timm:$coproc, (f32 SPR:$acc), timm:$imm)), + (f32 (CDE_VCX1A_fpsp p_imm:$coproc, SPR:$acc, imm_11b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)), + (f64 (CDE_VCX1_fpdp p_imm:$coproc, imm_11b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx1a timm:$coproc, (f64 DPR:$acc), timm:$imm)), + (f64 (CDE_VCX1A_fpdp p_imm:$coproc, DPR:$acc, imm_11b:$imm))>; + + def : Pat<(f32 (int_arm_cde_vcx2 timm:$coproc, (f32 SPR:$n), timm:$imm)), + (f32 (CDE_VCX2_fpsp p_imm:$coproc, SPR:$n, imm_6b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx2a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n), + timm:$imm)), + (f32 (CDE_VCX2A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, imm_6b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx2 timm:$coproc, (f64 DPR:$n), timm:$imm)), + (f64 (CDE_VCX2_fpdp p_imm:$coproc, DPR:$n, imm_6b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx2a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n), + timm:$imm)), + (f64 (CDE_VCX2A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, imm_6b:$imm))>; + + def : Pat<(f32 (int_arm_cde_vcx3 timm:$coproc, (f32 SPR:$n), (f32 SPR:$m), + timm:$imm)), + (f32 (CDE_VCX3_fpsp p_imm:$coproc, (f32 SPR:$n), (f32 SPR:$m), + imm_3b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx3a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n), + (f32 SPR:$m), timm:$imm)), + (f32 (CDE_VCX3A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, SPR:$m, + imm_3b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx3 timm:$coproc, (f64 DPR:$n), (f64 DPR:$m), + timm:$imm)), + (f64 (CDE_VCX3_fpdp p_imm:$coproc, DPR:$n, DPR:$m, imm_3b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx3a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n), + (f64 DPR:$m), timm:$imm)), + (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m, + imm_3b:$imm))>; +} + +let Predicates = [HasCDE, HasMVEInt] in { + def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)), + (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc), + timm:$imm)), + (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>; + + def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)), + (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc), + (v16i8 MQPR:$n), timm:$imm)), + (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, + imm_7b:$imm))>; + + def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n), + (v16i8 MQPR:$m), timm:$imm)), + (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m, + imm_4b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + timm:$imm)), + (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m, + imm_4b:$imm))>; +} + +multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> { + def : Pat<(VTI.Vec (int_arm_cde_vcx1q_predicated timm:$coproc, + 
(VTI.Vec MQPR:$inactive), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive)))>; + def : Pat<(VTI.Vec (int_arm_cde_vcx1qa_predicated timm:$coproc, + (VTI.Vec MQPR:$acc), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX1A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc), + imm_12b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + + def : Pat<(VTI.Vec (int_arm_cde_vcx2q_predicated timm:$coproc, + (VTI.Vec MQPR:$inactive), + (v16i8 MQPR:$n), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX2_vec p_imm:$coproc, (v16i8 MQPR:$n), + imm_7b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive)))>; + def : Pat<(VTI.Vec (int_arm_cde_vcx2qa_predicated timm:$coproc, + (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX2A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), timm:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + + def : Pat<(VTI.Vec (int_arm_cde_vcx3q_predicated timm:$coproc, + (VTI.Vec MQPR:$inactive), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX3_vec p_imm:$coproc, (v16i8 MQPR:$n), + (v16i8 MQPR:$m), + imm_4b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive)))>; + def : Pat<(VTI.Vec (int_arm_cde_vcx3qa_predicated timm:$coproc, + (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX3A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + imm_4b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; +} + +let Predicates = [HasCDE, HasMVEInt] in + foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in + defm : VCXPredicatedPat_m<VTI>; + +let Predicates = [HasCDE, HasMVEFloat] in + foreach VTI = [ MVE_v8f16, MVE_v4f32 ] in + defm : VCXPredicatedPat_m<VTI>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td index 1da32ad2af6c..e13f3437cc7b 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -205,7 +205,6 @@ def VPTPredROperand : AsmOperandClass { let Name = "VPTPredR"; let PredicateMethod = "isVPTPred"; } -def undef_tied_input; // Operand classes for the cluster of MC operands describing a // VPT-predicated MVE instruction. @@ -409,6 +408,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, bit thumbArithFlagSetting = 0; bit validForTailPredication = 0; + bit retainsPreviousHalfElement = 0; + bit horizontalReduction = 0; + bit doubleWidthResult = 0; // If this is a pseudo instruction, mark it isCodeGenOnly. 
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); @@ -422,6 +424,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im, let TSFlags{18-15} = D.Value; let TSFlags{19} = thumbArithFlagSetting; let TSFlags{20} = validForTailPredication; + let TSFlags{21} = retainsPreviousHalfElement; + let TSFlags{22} = horizontalReduction; + let TSFlags{23} = doubleWidthResult; let Constraints = cstr; let Itinerary = itin; @@ -1123,6 +1128,9 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> { class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP]; } +class FPRegs16Pat<dag pattern, dag result> : Pat<pattern, result> { + list<Predicate> Predicates = [HasFPRegs16]; +} class FP16Pat<dag pattern, dag result> : Pat<pattern, result> { list<Predicate> Predicates = [HasFP16]; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp index a802d5a06f07..2790ac215f86 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -126,7 +126,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const { MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4)); MIB.addMemOperand(MMO); BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg) .addReg(Reg, RegState::Kill) diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td index 3efe85a7d45c..da0a836c8f95 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -159,6 +159,8 @@ def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall, def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def ARMseretflag : SDNode<"ARMISD::SERET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov, @@ -243,6 +245,12 @@ def ARMqsub8b : SDNode<"ARMISD::QSUB8b", SDT_ARMAnd, []>; def ARMqadd16b : SDNode<"ARMISD::QADD16b", SDT_ARMAnd, []>; def ARMqsub16b : SDNode<"ARMISD::QSUB16b", SDT_ARMAnd, []>; +def SDT_ARMldrd : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def ARMldrd : SDNode<"ARMISD::LDRD", SDT_ARMldrd, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def SDT_ARMstrd : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def ARMstrd : SDNode<"ARMISD::STRD", SDT_ARMstrd, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + // Vector operations shared between NEON and MVE def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>; @@ -258,7 +266,7 @@ def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>; def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>; def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>; -def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, +def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>; def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", 
SDTARMVGETLN>; @@ -268,6 +276,10 @@ def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; +def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def ARMvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; +def ARMvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; @@ -279,6 +291,11 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>; def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>; def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>; +def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; +def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; + def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>; @@ -290,6 +307,36 @@ def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>; def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>; def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>; +// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a +// vector register as a different vector type, without changing the contents of +// the register. It differs from 'bitconvert' in that bitconvert reinterprets +// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST +// reinterprets the _register_ format - and in big-endian, the memory and +// register formats are different, so they are different operations. +// +// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of +// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness, +// whereas 'bitconvert' will map it to the high byte in big-endian mode, +// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the +// bitconvert would have to emit a VREV16.8 instruction, whereas the +// VECTOR_REG_CAST emits no code at all if the vector is already in a register. +def ARMVectorRegCastImpl : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>; + +// In little-endian, VECTOR_REG_CAST is often turned into bitconvert during +// lowering (because in that situation they're identical). So an isel pattern +// that needs to match something that's _logically_ a VECTOR_REG_CAST must +// _physically_ match a different node type depending on endianness. +// +// This 'PatFrags' instance is a centralized facility to make that easy. It +// matches VECTOR_REG_CAST in either endianness, and also bitconvert in the +// endianness where it's equivalent. +def ARMVectorRegCast: PatFrags< + (ops node:$x), [(ARMVectorRegCastImpl node:$x), (bitconvert node:$x)], [{ + // Reject a match against bitconvert (aka ISD::BITCAST) if big-endian + return !(CurDAG->getDataLayout().isBigEndian() && + N->getOpcode() == ISD::BITCAST); + }]>; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. 
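The lane-mapping argument in the VECTOR_REG_CAST comment above can be checked with a small host-side C++ sketch (illustrative only, not part of this patch; all names below are local to the sketch): a memory round-trip, which is what bitconvert models, gives an endianness-dependent result, while a register reinterpretation does not.

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // One i16 lane with distinguishable bytes: LSB 0x02, MSB 0x01.
  uint16_t lanes16[8] = {0x0102};
  uint8_t lanes8[16];

  // "bitconvert" semantics: store as i16 lanes, reload as i8 lanes
  // (VSTRH.16 then VLDRB.8). Which byte lands in i8 lane 0 depends on
  // the target's memory byte order.
  std::memcpy(lanes8, lanes16, sizeof lanes16);
  std::printf("memory round-trip, i8 lane 0: %#x\n", (unsigned)lanes8[0]);
  // Little-endian: 0x02 (the LSB); big-endian: 0x01 (the MSB), which is
  // why bitconvert costs a VREV16.8 there.

  // VECTOR_REG_CAST semantics: the LSB of i16 lane 0 becomes i8 lane 0
  // regardless of endianness, and emits no instructions at all.
  std::printf("register cast, i8 lane 0:     %#x\n",
              (unsigned)(lanes16[0] & 0xff));
  return 0;
}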
@@ -396,6 +443,62 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; +def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>; +def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>; + +//===----------------------------------------------------------------------===// +// NEON/MVE pattern fragments +// + +// Extract D sub-registers of Q registers. +def DSubReg_i8_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N), + MVT::i32); +}]>; +def DSubReg_i16_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N), + MVT::i32); +}]>; +def DSubReg_i32_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N), + MVT::i32); +}]>; +def DSubReg_f64_reg : SDNodeXForm<imm, [{ + assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N), + MVT::i32); +}]>; + +// Extract S sub-registers of Q/D registers. +def SSubReg_f32_reg : SDNodeXForm<imm, [{ + assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N), + MVT::i32); +}]>; + +// Extract S sub-registers of Q/D registers containing a given f16/bf16 lane. +def SSubReg_f16_reg : SDNodeXForm<imm, [{ + assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); + return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N), + MVT::i32); +}]>; + +// Translate lane numbers from Q registers to D subregs. +def SubReg_i8_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32); +}]>; +def SubReg_i16_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32); +}]>; +def SubReg_i32_lane : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32); +}]>; + + + //===----------------------------------------------------------------------===// // Operand Definitions. // @@ -2695,6 +2798,14 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { Requires<[IsARM, HasV5TE]>; } +let mayLoad = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in { +def LOADDUAL : ARMPseudoInst<(outs GPRPairOp:$Rt), (ins addrmode3:$addr), + 64, IIC_iLoad_d_r, []>, + Requires<[IsARM, HasV5TE]> { + let AM = AddrMode3; +} +} + def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr), NoItinerary, "lda", "\t$Rt, $addr", []>; def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr), @@ -2766,7 +2877,7 @@ multiclass AI2_ldridx<bit isByte, string opc, } let mayLoad = 1, hasSideEffects = 0 in { -// FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or +// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or // IIC_iLoad_siu depending on whether it the offset register is shifted. 
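The SDNodeXForms in the hunk above all implement one small piece of arithmetic: a 128-bit Q register is a pair of 64-bit D subregisters, so a Q-register lane index splits into a D-subregister index and a lane within it. A minimal C++ sketch of that mapping (illustrative, not part of the patch), using v16i8 as the example:

#include <cstdio>

int main() {
  // v16i8 in a Q register: 8 i8 lanes per 64-bit D subregister.
  for (unsigned lane = 0; lane < 16; ++lane) {
    unsigned dsub    = lane / 8; // DSubReg_i8_reg: which D subreg
    unsigned sublane = lane & 7; // SubReg_i8_lane: lane within it
    std::printf("v16i8 lane %2u -> dsub_%u, lane %u\n", lane, dsub, sublane);
  }
  // The i16 and i32 variants are the same idea with 4 and 2 lanes per
  // D register (divide by 4 / mask with 3, divide by 2 / mask with 1);
  // SSubReg_f16_reg divides by 2 because an S register holds two
  // f16 (or bf16) lanes.
  return 0;
}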
defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>; defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>; @@ -2933,6 +3044,9 @@ multiclass AI3ldrT<bits<4> op, string opc> { let Inst{3-0} = Rm{3-0}; let DecoderMethod = "DecodeLDR"; } + + def ii : ARMAsmPseudo<!strconcat(opc, "${p} $Rt, $addr"), + (ins addr_offset_none:$addr, pred:$p), (outs GPR:$Rt)>; } defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">; @@ -2970,6 +3084,14 @@ let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in { } } +let mayStore = 1, hasSideEffects = 0, hasNoSchedulingInfo = 1 in { +def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr), + 64, IIC_iStore_d_r, []>, + Requires<[IsARM, HasV5TE]> { + let AM = AddrMode3; +} +} + // Indexed stores multiclass AI2_stridx<bit isByte, string opc, InstrItinClass iii, InstrItinClass iir> { @@ -3036,7 +3158,7 @@ multiclass AI2_stridx<bit isByte, string opc, } let mayStore = 1, hasSideEffects = 0 in { -// FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or +// FIXME: for STR_PRE_REG etc. the itinerary should be either IIC_iStore_ru or // IIC_iStore_siu depending on whether it the offset register is shifted. defm STR : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>; defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>; @@ -3770,9 +3892,8 @@ def QSUB16 : AAIIntrinsic<0b01100010, 0b11110111, "qsub16", int_arm_qsub16>; def QSUB8 : AAIIntrinsic<0b01100010, 0b11111111, "qsub8", int_arm_qsub8>; def QDADD : AAIRevOpr<0b00010100, 0b00000101, "qdadd", - [(set GPRnopc:$Rd, (int_arm_qadd (int_arm_qadd GPRnopc:$Rm, - GPRnopc:$Rm), - GPRnopc:$Rn))]>; + [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, + (int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>; def QDSUB : AAIRevOpr<0b00010110, 0b00000101, "qdsub", [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, (int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>; @@ -3787,7 +3908,7 @@ def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b), (QADD GPR:$a, GPR:$b)>; def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b), (QSUB GPR:$a, GPR:$b)>; -def : ARMV5TEPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), +def : ARMV5TEPat<(saddsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), (QDADD rGPR:$Rm, rGPR:$Rn)>; def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), (QDSUB rGPR:$Rm, rGPR:$Rn)>; @@ -5414,7 +5535,8 @@ def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm", def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */, (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, - imm0_7:$opc2), []>; + imm0_7:$opc2), []>, + ComplexDeprecationPredicate<"MRC">; def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm", (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm, 0, pred:$p)>; @@ -5691,7 +5813,7 @@ def : ARMPat<(ARMthread_pointer), (MRC 15, 0, 13, 0, 3)>, // when we get here from a longjmp(). We force everything out of registers // except for our own input by listing the relevant registers in Defs. By // doing so, we also cause the prologue/epilogue code to actively preserve -// all of the callee-saved resgisters, which is exactly what we want. +// all of the callee-saved registers, which is exactly what we want. // A constant value is passed in $val, and we use the location as a scratch. 
// // These are pseudo-instructions and are lowered to individual MC-insts, so @@ -5976,6 +6098,12 @@ include "ARMInstrNEON.td" include "ARMInstrMVE.td" //===----------------------------------------------------------------------===// +// CDE (Custom Datapath Extension) +// + +include "ARMInstrCDE.td" + +//===----------------------------------------------------------------------===// // Assembler aliases // diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td index 604291be822c..2a1f50d97e3b 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -10,44 +10,6 @@ // //===----------------------------------------------------------------------===// -class ExpandImmAsmOp<string shift> : AsmOperandClass { - let Name = !strconcat("ExpandImm", shift); - let PredicateMethod = !strconcat("isExpImm<", shift, ">"); - let RenderMethod = "addImmOperands"; -} -class InvertedExpandImmAsmOp<string shift, string size> : AsmOperandClass { - let Name = !strconcat("InvertedExpandImm", shift, "_", size); - let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">"); - let RenderMethod = "addImmOperands"; -} - -class ExpandImm<string shift> : Operand<i32> { - let ParserMatchClass = ExpandImmAsmOp<shift>; - let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>"); - let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">"); - let PrintMethod = "printExpandedImmOperand"; -} -class InvertedExpandImm<string shift, string size> : Operand<i32> { - let ParserMatchClass = InvertedExpandImmAsmOp<shift, size>; - let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>"); - let PrintMethod = "printExpandedImmOperand"; - // No decoder method needed, because this operand type is only used - // by aliases (VAND and VORN) -} - -def expzero00 : ExpandImm<"0">; -def expzero08 : ExpandImm<"8">; -def expzero16 : ExpandImm<"16">; -def expzero24 : ExpandImm<"24">; - -def expzero00inv16 : InvertedExpandImm<"0", "16">; -def expzero08inv16 : InvertedExpandImm<"8", "16">; - -def expzero00inv32 : InvertedExpandImm<"0", "32">; -def expzero08inv32 : InvertedExpandImm<"8", "32">; -def expzero16inv32 : InvertedExpandImm<"16", "32">; -def expzero24inv32 : InvertedExpandImm<"24", "32">; - // VPT condition mask def vpt_mask : Operand<i32> { let PrintMethod = "printVPTMask"; @@ -277,7 +239,8 @@ class mve_addr_q_shift<int shift> : MemOperand { // A family of classes wrapping up information about the vector types // used by MVE. -class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, +class MVEVectorVTInfo<ValueType vec, ValueType dblvec, + ValueType pred, ValueType dblpred, bits<2> size, string suffixletter, bit unsigned> { // The LLVM ValueType representing the vector, so we can use it in // ISel patterns. @@ -300,6 +263,9 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, // directly. ValueType Pred = pred; + // Same as Pred but for DblVec rather than Vec. + ValueType DblPred = dblpred; + // The most common representation of the vector element size in MVE // instruction encodings: a 2-bit value V representing an (8<<V)-bit // vector element. @@ -319,38 +285,38 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred, !cast<string>(LaneBits)); // The suffix used on an instruction that mentions the whole type. 
- string Suffix = suffixletter ## BitsSuffix; + string Suffix = suffixletter # BitsSuffix; // The letter part of the suffix only. string SuffixLetter = suffixletter; } // Integer vector types that don't treat signed and unsigned differently. -def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>; -def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>; -def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>; -def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>; +def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "i", ?>; +def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "i", ?>; +def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "i", ?>; +def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "i", ?>; // Explicitly signed and unsigned integer vectors. They map to the // same set of LLVM ValueTypes as above, but are represented // differently in assembly and instruction encodings. -def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>; -def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>; -def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>; -def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>; -def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>; -def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>; -def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>; -def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>; +def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "s", 0b0>; +def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "s", 0b0>; +def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "s", 0b0>; +def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "s", 0b0>; +def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "u", 0b1>; +def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "u", 0b1>; +def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "u", 0b1>; +def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "u", 0b1>; // FP vector types. -def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>; -def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>; -def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>; +def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, v4i1, 0b01, "f", ?>; +def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v4i1, 0b10, "f", ?>; +def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>; // Polynomial vector types. 
-def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>; -def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>; +def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>; +def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>; // --------- Start of base classes for the instructions themselves @@ -473,6 +439,8 @@ class MVE_ScalarShiftDoubleReg<string iname, dag iops, string asm, let Inst{19-17} = RdaLo{3-1}; let Inst{11-9} = RdaHi{3-1}; + + let hasSideEffects = 0; } class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16, @@ -590,6 +558,7 @@ class MVE_VABAV<string suffix, bit U, bits<2> size> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let horizontalReduction = 1; } multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> { @@ -639,38 +608,63 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; + let validForTailPredication = 1; } -multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size, - list<dag> pattern=[]> { - def acc : MVE_VADDV<"vaddva", suffix, +def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>; +def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>; + +multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> { + def acc : MVE_VADDV<"vaddva", VTI.Suffix, (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src", - 0b1, U, size, pattern>; - def no_acc : MVE_VADDV<"vaddv", suffix, + 0b1, VTI.Unsigned, VTI.Size>; + def no_acc : MVE_VADDV<"vaddv", VTI.Suffix, (ins MQPR:$Qm), "", - 0b0, U, size, pattern>; -} + 0b0, VTI.Unsigned, VTI.Size>; -defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>; -defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>; -defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>; -defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; -defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; -defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; + defvar InstA = !cast<Instruction>(NAME # "acc"); + defvar InstN = !cast<Instruction>(NAME # "no_acc"); -let Predicates = [HasMVEInt] in { - def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>; - def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>; - def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))), - (i32 (MVE_VADDVu32acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))), - (i32 (MVE_VADDVu16acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))), - (i32 (MVE_VADDVu8acc $src2, $src1))>; + let Predicates = [HasMVEInt] in { + if VTI.Unsigned then { + def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + } else { + def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + } + def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), + (i32 VTI.Unsigned), + (VTI.Pred VCCR:$pred))), + (i32 (InstN $vec, ARMVCCThen, $pred))>; + def : 
Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), + (i32 VTI.Unsigned), + (VTI.Pred VCCR:$pred)), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; + } } +defm MVE_VADDVs8 : MVE_VADDV_A<MVE_v16s8>; +defm MVE_VADDVs16 : MVE_VADDV_A<MVE_v8s16>; +defm MVE_VADDVs32 : MVE_VADDV_A<MVE_v4s32>; +defm MVE_VADDVu8 : MVE_VADDV_A<MVE_v16u8>; +defm MVE_VADDVu16 : MVE_VADDV_A<MVE_v8u16>; +defm MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>; + class MVE_VADDLV<string iname, string suffix, dag iops, string cstr, bit A, bit U, list<dag> pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, @@ -689,21 +683,58 @@ class MVE_VADDLV<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; -} - -multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> { - def acc : MVE_VADDLV<"vaddlva", suffix, + let horizontalReduction = 1; +} + +def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2> +]>; +def SDTVecReduceLA : SDTypeProfile<2, 3, [ // VADDLVA + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4> +]>; +def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2> +]>; +def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4>, SDTCisVec<5> +]>; + +multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> { + def acc : MVE_VADDLV<"vaddlva", VTI.Suffix, (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm), "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", - 0b1, U, pattern>; - def no_acc : MVE_VADDLV<"vaddlv", suffix, + 0b1, VTI.Unsigned>; + def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix, (ins MQPR:$Qm), "", - 0b0, U, pattern>; -} + 0b0, VTI.Unsigned>; + + defvar InstA = !cast<Instruction>(NAME # "acc"); + defvar InstN = !cast<Instruction>(NAME # "no_acc"); + defvar letter = VTI.SuffixLetter; + defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>; + defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>; + defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>; + defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>; -defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>; -defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>; + let Predicates = [HasMVEInt] in { + def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)), + (InstN (v4i32 MQPR:$vec))>; + def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>; + def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)), + (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>; + def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + (VTI.Pred VCCR:$pred)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred))>; + } +} + +defm MVE_VADDLVs32 : MVE_VADDLV_A<MVE_v4s32>; +defm MVE_VADDLVu32 : MVE_VADDLV_A<MVE_v4u32>; class MVE_VMINMAXNMV<string iname, string suffix, bit sz, bit bit_17, bit bit_7, list<dag> pattern=[]> @@ -724,25 +755,48 @@ class MVE_VMINMAXNMV<string iname, string suffix, bit sz, let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; let Predicates = [HasMVEFloat]; + let hasSideEffects = 0; } -multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> { - def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>; 
- def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>; -} +multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin, + MVEVectorVTInfo VTI, string intrBaseName, + ValueType Scalar, RegisterClass ScalarReg> { + def "": MVE_VMINMAXNMV<iname, VTI.Suffix, VTI.Size{0}, notAbs, isMin>; + defvar Inst = !cast<Instruction>(NAME); + defvar unpred_intr = !cast<Intrinsic>(intrBaseName); + defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated"); -defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>; -defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>; + let Predicates = [HasMVEFloat] in { + def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev), + (VTI.Vec MQPR:$vec))), + (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR), + (VTI.Vec MQPR:$vec)), + ScalarReg)>; + def : Pat<(Scalar (pred_intr (Scalar ScalarReg:$prev), + (VTI.Vec MQPR:$vec), + (VTI.Pred VCCR:$pred))), + (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR), + (VTI.Vec MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred)), + ScalarReg)>; + } +} -multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> { - def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>; - def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>; +multiclass MVE_VMINMAXNMV_fty<string iname, bit notAbs, bit isMin, + string intrBase> { + defm f32 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v4f32, intrBase, + f32, SPR>; + defm f16 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v8f16, intrBase, + f16, HPR>; } -defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>; -defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>; +defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 1, 1, "int_arm_mve_minnmv">; +defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 1, 0, "int_arm_mve_maxnmv">; +defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">; +defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">; class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size, bit bit_17, bit bit_7, list<dag> pattern=[]> @@ -762,33 +816,40 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size, let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; } -multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7, - MVEVectorVTInfo VTI, Intrinsic intr> { +multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin, + MVEVectorVTInfo VTI, string intrBaseName> { def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, - bit_17, bit_7>; - defvar Inst = !cast<Instruction>(NAME); + notAbs, isMin>; + defvar Inst = !cast<Instruction>(NAME); + defvar unpred_intr = !cast<Intrinsic>(intrBaseName); + defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated"); + defvar base_args = (? (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)); + defvar args = !if(notAbs, !con(base_args, (? 
(i32 VTI.Unsigned))), + base_args); - let Predicates = [HasMVEInt] in - def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))), - (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; + let Predicates = [HasMVEInt] in { + def : Pat<(i32 !con(args, (unpred_intr))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; + def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; + } } -multiclass MVE_VMINMAXV_ty<string iname, bit bit_7, - Intrinsic intr_s, Intrinsic intr_u> { - defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>; - defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>; - defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>; - defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>; - defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>; - defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>; +multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> { + defm s8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16s8, intrBaseName>; + defm s16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8s16, intrBaseName>; + defm s32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4s32, intrBaseName>; + defm u8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16u8, intrBaseName>; + defm u16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8u16, intrBaseName>; + defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>; } -defm MVE_VMINV : MVE_VMINMAXV_ty< - "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>; -defm MVE_VMAXV : MVE_VMINMAXV_ty< - "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>; +defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">; +defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), @@ -819,14 +880,14 @@ let Predicates = [HasMVEInt] in { } -multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> { - def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>; - def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>; - def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>; +multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> { + defm s8 : MVE_VMINMAXV_p<iname, 0, isMin, MVE_v16s8, intrBaseName>; + defm s16: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v8s16, intrBaseName>; + defm s32: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v4s32, intrBaseName>; } -defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>; -defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>; +defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">; +defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">; class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0> @@ -847,6 +908,12 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = bit_0; + let horizontalReduction = 1; + // Allow tail predication for non-exchanging versions. As this is also a + // horizontalReduction, ARMLowOverheadLoops will also have to check that + // the vector operands contain zeros in their false lanes for the instruction + // to be properly valid. 
+ let validForTailPredication = !eq(X, 0); } multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI, @@ -932,6 +999,58 @@ defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>; defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>; +def SDTVecReduce2 : SDTypeProfile<1, 2, [ // VMLAV + SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2> +]>; +def SDTVecReduce2L : SDTypeProfile<2, 2, [ // VMLALV + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3> +]>; +def SDTVecReduce2LA : SDTypeProfile<2, 4, [ // VMLALVA + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4>, SDTCisVec<5> +]>; +def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>; +def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>; +def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>; +def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>; +def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>; +def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>; + +let Predicates = [HasMVEInt] in { + def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))), + (i32 (MVE_VMLADAVu32 $src1, $src2))>; + def : Pat<(i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))), + (i32 (MVE_VMLADAVu16 $src1, $src2))>; + def : Pat<(i32 (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), + (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))), + (i32 (MVE_VMLADAVu8 $src1, $src2))>; + def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), + (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + + def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>; + def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>; + def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; + def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))), + (i32 tGPREven:$src3))), + (i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>; + def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; + def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)), + (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; +} + // vmlav aliases vmladav foreach acc = ["", "a"] in { foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in { @@ -963,6 +1082,14 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr, let Inst{5} = A; let Inst{3-1} = Qm{2-0}; let Inst{0} = bit_0; + let horizontalReduction = 1; + // Allow tail predication for non-exchanging versions. 
As this is also a + // horizontalReduction, ARMLowOverheadLoops will also have to check that + // the vector operands contain zeros in their false lanes for the instruction + // to be properly valid. + let validForTailPredication = !eq(X, 0); + + let hasSideEffects = 0; } multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix, @@ -1023,6 +1150,26 @@ multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> { defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>; defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(ARMVMLALVs (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; + def : Pat<(ARMVMLALVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; + + def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)), + (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>; + def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; + def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), + (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>; +} + // vmlalv aliases vmlaldav foreach acc = ["", "a"] in { foreach suffix = ["s16", "s32", "u16", "u32"] in { @@ -1244,28 +1391,29 @@ let Predicates = [HasMVEInt] in { (v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>; } -let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))), - (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>; - def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))), - (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>; - def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))), - (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>; +multiclass MVE_VREV_basic_patterns<int revbits, list<MVEVectorVTInfo> VTIs, + Instruction Inst> { + defvar unpred_op = !cast<SDNode>("ARMvrev" # revbits); - def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))), - (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>; - def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))), - (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>; + foreach VTI = VTIs in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src))), + (VTI.Vec (Inst (VTI.Vec MQPR:$src)))>; + def : Pat<(VTI.Vec (int_arm_mve_vrev_predicated (VTI.Vec MQPR:$src), + revbits, (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$src), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } +} + +let Predicates = [HasMVEInt] in { + defm: MVE_VREV_basic_patterns<64, [MVE_v4i32, MVE_v4f32], MVE_VREV64_32>; + defm: MVE_VREV_basic_patterns<64, [MVE_v8i16, MVE_v8f16], MVE_VREV64_16>; + defm: MVE_VREV_basic_patterns<64, [MVE_v16i8 ], MVE_VREV64_8>; - def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))), - (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>; + defm: MVE_VREV_basic_patterns<32, [MVE_v8i16, MVE_v8f16], MVE_VREV32_16>; + defm: MVE_VREV_basic_patterns<32, 
[MVE_v16i8 ], MVE_VREV32_8>; - def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))), - (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>; - def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))), - (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>; - def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))), - (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>; + defm: MVE_VREV_basic_patterns<16, [MVE_v16i8 ], MVE_VREV16_8>; } def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), @@ -1280,14 +1428,14 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm), } let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))), - (v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>; - def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))), - (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>; - def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))), - (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>; - def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))), - (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>; + foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in { + def : Pat<(VTI.Vec (vnotq (VTI.Vec MQPR:$val1))), + (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1)))>; + def : Pat<(VTI.Vec (int_arm_mve_mvn_predicated (VTI.Vec MQPR:$val1), + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } } class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28> @@ -1383,10 +1531,10 @@ defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>; defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>; defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>; -class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> +class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps> : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { - bits<8> imm; + bits<12> imm; bits<4> Qd; let Inst{28} = imm{7}; @@ -1396,66 +1544,59 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps> let Inst{18-16} = imm{6-4}; let Inst{15-13} = Qd{2-0}; let Inst{12} = 0b0; - let Inst{11-8} = cmode; + let Inst{11} = halfword; + let Inst{10} = !if(halfword, 0, imm{10}); + let Inst{9} = imm{9}; + let Inst{8} = 0b1; let Inst{7-6} = 0b01; let Inst{4} = 0b1; let Inst{3-0} = imm{3-0}; } -class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type> - : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { - let Inst{5} = 0b0; - let validForTailPredication = 1; -} +multiclass MVE_bit_cmode_p<string iname, bit opcode, + MVEVectorVTInfo VTI, Operand imm_type, SDNode op> { + def "" : MVE_bit_cmode<iname, VTI.Suffix, VTI.Size{0}, + (ins MQPR:$Qd_src, imm_type:$imm)> { + let Inst{5} = opcode; + let validForTailPredication = 1; + } -def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>; -def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>; -def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>; -def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>; -def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>; -def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>; - -def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins 
MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; -def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm", - (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>; + defvar Inst = !cast<Instruction>(NAME); + defvar UnpredPat = (VTI.Vec (op (VTI.Vec MQPR:$src), timm:$simm)); -def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", - (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>; + let Predicates = [HasMVEInt] in { + def : Pat<UnpredPat, (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>; + def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred), + UnpredPat, (VTI.Vec MQPR:$src))), + (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm, + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; + } +} -class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type> - : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> { - let Inst{5} = 0b1; - let validForTailPredication = 1; +multiclass MVE_VORRimm<MVEVectorVTInfo VTI, Operand imm_type> { + defm "": MVE_bit_cmode_p<"vorr", 0, VTI, imm_type, ARMvorrImm>; +} +multiclass MVE_VBICimm<MVEVectorVTInfo VTI, Operand imm_type> { + defm "": MVE_bit_cmode_p<"vbic", 1, VTI, imm_type, ARMvbicImm>; } -def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>; -def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>; -def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>; -def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>; -def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>; -def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>; - -def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; -def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm", - (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>; +defm MVE_VORRimmi16 : MVE_VORRimm<MVE_v8i16, nImmSplatI16>; +defm MVE_VORRimmi32 : MVE_VORRimm<MVE_v4i32, nImmSplatI32>; +defm MVE_VBICimmi16 : MVE_VBICimm<MVE_v8i16, nImmSplatI16>; +defm MVE_VBICimmi32 : MVE_VBICimm<MVE_v4i32, nImmSplatI32>; + +def MVE_VORNimmi16 : MVEInstAlias<"vorn${vp}.i16\t$Qd, $imm", + (MVE_VORRimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>; +def MVE_VORNimmi32 : MVEInstAlias<"vorn${vp}.i32\t$Qd, $imm", + (MVE_VORRimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>; + +def MVE_VANDimmi16 : MVEInstAlias<"vand${vp}.i16\t$Qd, $imm", + (MVE_VBICimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>; +def MVE_VANDimmi32 : MVEInstAlias<"vand${vp}.i32\t$Qd, $imm", + (MVE_VBICimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>; + +def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm", + (MVE_VORR MQPR:$Qd, MQPR:$Qm, 
MQPR:$Qm, vpred_r:$vp)>; class MVE_VMOV_lane_direction { bit bit_20; @@ -1494,6 +1635,8 @@ class MVE_VMOV_lane<string suffix, bit U, dag indexop, let Inst{11-8} = 0b1011; let Inst{7} = Qd{3}; let Inst{4-0} = 0b10000; + + let hasSideEffects = 0; } class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir> @@ -1557,10 +1700,14 @@ let Predicates = [HasMVEInt] in { (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlanes (v8f16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>; def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane), (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; + def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane), + (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>; def : Pat<(v16i8 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; @@ -1575,8 +1722,8 @@ let Predicates = [HasMVEInt] in { def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane), (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>; - def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane), - (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>; + def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm:$lane), + (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>; def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane), (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>; def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane), @@ -1588,8 +1735,8 @@ let Predicates = [HasMVEInt] in { (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>; def : Pat<(v4f32 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; - def : Pat<(v8f16 (scalar_to_vector HPR:$src)), - (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>; + def : Pat<(v8f16 (scalar_to_vector (f16 HPR:$src))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>; def : Pat<(v8f16 (scalar_to_vector GPR:$src)), (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>; } @@ -1882,6 +2029,26 @@ class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]> let validForTailPredication = 1; } +def addnuw : PatFrag<(ops node:$lhs, node:$rhs), + (add node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoUnsignedWrap(); +}]>; + +def addnsw : PatFrag<(ops node:$lhs, node:$rhs), + (add node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoSignedWrap(); +}]>; + +def subnuw : PatFrag<(ops node:$lhs, node:$rhs), + (sub node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoUnsignedWrap(); +}]>; + +def subnsw : PatFrag<(ops node:$lhs, node:$rhs), + (sub node:$lhs, node:$rhs), [{ + return N->getFlags().hasNoSignedWrap(); +}]>; + multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int> { def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>; @@ -1913,6 +2080,37 @@ defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>; defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>; defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>; +// Rounding Halving Add perform the arithemtic operation with an extra bit of +// precision, before performing the shift, to void clipping errors. 
We're not +// modelling that here with these patterns, but we're using no wrap forms of +// add to ensure that the extra bit of information is not needed for the +// arithmetic or the rounding. +def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvmovImm (i32 3585)))), + (i32 1))), + (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvmovImm (i32 2049)))), + (i32 1))), + (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvmovImm (i32 1)))), + (i32 1))), + (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvmovImm (i32 3585)))), + (i32 1))), + (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvmovImm (i32 2049)))), + (i32 1))), + (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; +def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvmovImm (i32 1)))), + (i32 1))), + (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; + + class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract, bits<2> size, list<dag> pattern=[]> : MVE_int<iname, suffix, size, pattern> { @@ -1936,7 +2134,8 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size, : MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>; multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, - SDNode unpred_op, Intrinsic pred_int> { + SDNode unpred_op, Intrinsic pred_int, PatFrag add_op, + SDNode shift_op> { def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); @@ -1945,6 +2144,9 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), + (Inst MQPR:$Qm, MQPR:$Qn)>; + // Predicated add-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), @@ -1954,18 +2156,24 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI, } } -multiclass MVE_VHADD<MVEVectorVTInfo VTI> - : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>; +multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> + : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op, + shift_op>; -defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>; -defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>; -defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>; -defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>; -defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>; -defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>; +// Halving add/sub perform the arithemtic operation with an extra bit of +// precision, before performing the shift, to void clipping errors. We're not +// modelling that here with these patterns, but we're using no wrap forms of +// add/sub to ensure that the extra bit of information is not needed. 
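The reasoning in the two comments above (for VRHADD and for VHADD/VHSUB) is easy to check numerically. In a host-side C++ sketch, illustrative only and not part of the patch: the hardware computes the average at double width, so selecting these instructions for an IR-level (a + b) >> 1 is only safe when the add carries a no-wrap flag, which is what the addnuw/addnsw/subnuw/subnsw fragments test. (The ARMvmovImm operands in the VRHADD patterns are the encoded splat-of-1 rounding addend.)

#include <cstdint>
#include <cstdio>

// What the hardware does: widen, add, optionally round, then shift.
static uint8_t vhadd_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + b) >> 1);
}
static uint8_t vrhadd_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + b + 1) >> 1);
}

int main() {
  uint8_t a = 200, b = 101;  // a + b = 301, which wraps in 8 bits
  std::printf("vhadd  = %u\n", (unsigned)vhadd_u8(a, b));  // 150
  std::printf("vrhadd = %u\n", (unsigned)vrhadd_u8(a, b)); // 151 (rounded)
  // Without the extra bit of precision, the wrapping 8-bit add gives a
  // wrong answer -- hence the nuw/nsw requirement in the patterns:
  std::printf("wrapped (a+b)>>1 = %u\n",
              (unsigned)((uint8_t)(a + b) >> 1));          // 22, not 150
  return 0;
}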
+defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8, addnsw, ARMvshrsImm>; +defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, addnsw, ARMvshrsImm>; +defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, addnsw, ARMvshrsImm>; +defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8, addnuw, ARMvshruImm>; +defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, addnuw, ARMvshruImm>; +defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, addnuw, ARMvshruImm>; multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, - SDNode unpred_op, Intrinsic pred_int> { + SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op, + SDNode shift_op> { def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>; defvar Inst = !cast<Instruction>(NAME); @@ -1975,6 +2183,10 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, (i32 VTI.Unsigned))), (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (shift_op (sub_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))), + (Inst MQPR:$Qm, MQPR:$Qn)>; + + // Predicated subtract-and-divide-by-two def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned), (VTI.Pred VCCR:$mask), @@ -1985,15 +2197,16 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI, } } -multiclass MVE_VHSUB<MVEVectorVTInfo VTI> - : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>; +multiclass MVE_VHSUB<MVEVectorVTInfo VTI, PatFrag sub_op, SDNode shift_op> + : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, sub_op, + shift_op>; -defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>; -defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>; -defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>; -defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>; -defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>; -defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>; +defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8, subnsw, ARMvshrsImm>; +defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16, subnsw, ARMvshrsImm>; +defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32, subnsw, ARMvshrsImm>; +defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8, subnuw, ARMvshruImm>; +defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16, subnuw, ARMvshruImm>; +defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32, subnuw, ARMvshruImm>; class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, @@ -2028,24 +2241,37 @@ let Predicates = [HasMVEInt] in { def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))), (MVE_VDUP32 rGPR:$elem)>; - def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)), - (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; - // For the 16-bit and 8-bit vduplanes we don't care about the signedness - // of the lane move operation as we only want the lowest 8/16 bits anyway. 
- def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)), - (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; - def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)), - (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>; - - def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))), - (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>; - def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))), - (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>; + def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP16 rGPR:$elem)>; + def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP32 rGPR:$elem)>; - def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)), - (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; - def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)), - (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>; + // Match a vselect with an ARMvdup as a predicated MVE_VDUP + def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), + (v16i8 (ARMvdup (i32 rGPR:$elem))), + (v16i8 MQPR:$inactive))), + (MVE_VDUP8 rGPR:$elem, ARMVCCThen, (v16i1 VCCR:$pred), + (v16i8 MQPR:$inactive))>; + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), + (v8i16 (ARMvdup (i32 rGPR:$elem))), + (v8i16 MQPR:$inactive))), + (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), + (v8i16 MQPR:$inactive))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), + (v4i32 (ARMvdup (i32 rGPR:$elem))), + (v4i32 MQPR:$inactive))), + (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), + (v4i32 MQPR:$inactive))>; + def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), + (v4f32 (ARMvdup (i32 rGPR:$elem))), + (v4f32 MQPR:$inactive))), + (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), + (v4f32 MQPR:$inactive))>; + def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), + (v8f16 (ARMvdup (i32 rGPR:$elem))), + (v8f16 MQPR:$inactive))), + (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), + (v8f16 MQPR:$inactive))>; } @@ -2079,32 +2305,43 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>; -def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>; -def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>; +multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI, + SDNode unpred_op> { + def "": MVE_VCLSCLZ<"v"#opname, VTI.Suffix, VTI.Size, opcode>; -def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>; -def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>; -def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_"#opname#"_predicated"); -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))), - (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>; - def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))), - (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>; - def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))), - (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>; + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } } +defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, int_arm_mve_vcls>; +defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>; +defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, 
MVE_v4s32, int_arm_mve_vcls>; + +defm MVE_VCLZs8 : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>; +defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>; +defm MVE_VCLZs32 : MVE_VCLSCLZ_p<"clz", 1, MVE_v4i32, ctlz>; + class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, - list<dag> pattern=[]> + bit saturate, list<dag> pattern=[]> : MVEIntSingleSrc<iname, suffix, size, pattern> { let Inst{28} = 0b1; let Inst{25-23} = 0b111; let Inst{21-20} = 0b11; - let Inst{17-16} = 0b01; - let Inst{12-8} = 0b00011; + let Inst{17} = 0b0; + let Inst{16} = !eq(saturate, 0); + let Inst{12-11} = 0b00; + let Inst{10} = saturate; + let Inst{9-8} = 0b11; let Inst{7} = negate; let Inst{6} = 0b1; let Inst{4} = 0b0; @@ -2112,61 +2349,40 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate, let validForTailPredication = 1; } -def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>; -def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>; -def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (abs (v16i8 MQPR:$v))), - (v16i8 (MVE_VABSs8 $v))>; - def : Pat<(v8i16 (abs (v8i16 MQPR:$v))), - (v8i16 (MVE_VABSs16 $v))>; - def : Pat<(v4i32 (abs (v4i32 MQPR:$v))), - (v4i32 (MVE_VABSs32 $v))>; -} +multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate, + SDNode unpred_op, Intrinsic pred_int, + MVEVectorVTInfo VTI> { + def "" : MVE_VABSNEG_int<iname, VTI.Suffix, VTI.Size, negate, saturate>; + defvar Inst = !cast<Instruction>(NAME); -def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>; -def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>; -def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>; + let Predicates = [HasMVEInt] in { + // VQABS and VQNEG have more difficult isel patterns defined elsewhere + if !eq(saturate, 0) then { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>; + } -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))), - (v16i8 (MVE_VNEGs8 $v))>; - def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))), - (v8i16 (MVE_VNEGs16 $v))>; - def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))), - (v4i32 (MVE_VNEGs32 $v))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>; + } } -class MVE_VQABSNEG<string iname, string suffix, bits<2> size, - bit negate, list<dag> pattern=[]> - : MVEIntSingleSrc<iname, suffix, size, pattern> { - - let Inst{28} = 0b1; - let Inst{25-23} = 0b111; - let Inst{21-20} = 0b11; - let Inst{17-16} = 0b00; - let Inst{12-8} = 0b00111; - let Inst{7} = negate; - let Inst{6} = 0b1; - let Inst{4} = 0b0; - let Inst{0} = 0b0; - let validForTailPredication = 1; +foreach VTI = [ MVE_v16s8, MVE_v8s16, MVE_v4s32 ] in { + defm "MVE_VABS" # VTI.Suffix : MVE_VABSNEG_int_m< + "vabs", 0, 0, abs, int_arm_mve_abs_predicated, VTI>; + defm "MVE_VQABS" # VTI.Suffix : MVE_VABSNEG_int_m< + "vqabs", 0, 1, ?, int_arm_mve_qabs_predicated, VTI>; + defm "MVE_VNEG" # VTI.Suffix : MVE_VABSNEG_int_m< + "vneg", 1, 0, vnegq, int_arm_mve_neg_predicated, VTI>; + defm "MVE_VQNEG" # VTI.Suffix : MVE_VABSNEG_int_m< + "vqneg", 1, 1, ?, int_arm_mve_qneg_predicated, VTI>; } -def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>; -def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>; -def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>; - -def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>; -def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", 
"s16", 0b01, 0b1>; -def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>; - // int_min/int_max: vector containing INT_MIN/INT_MAX VTI.Size times // zero_vec: v4i32-initialized zero vector, potentially wrapped in a bitconvert multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag int_max, - dag zero_vec, MVE_VQABSNEG vqabs_instruction, - MVE_VQABSNEG vqneg_instruction> { + dag zero_vec, MVE_VABSNEG_int vqabs_instruction, + MVE_VABSNEG_int vqneg_instruction> { let Predicates = [HasMVEInt] in { // The below tree can be replaced by a vqabs instruction, as it represents // the following vectorized expression (r being the value in $reg): @@ -2257,6 +2473,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>; def : Pat<(v4i32 (ARMvmovImm timm:$simm)), (v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>; + def : Pat<(v2i64 (ARMvmovImm timm:$simm)), + (v2i64 (MVE_VMOVimmi64 nImmSplatI64:$simm))>; def : Pat<(v8i16 (ARMvmvnImm timm:$simm)), (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>; @@ -2265,6 +2483,15 @@ let Predicates = [HasMVEInt] in { def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)), (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>; + + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm), + MQPR:$inactive)), + (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm, + ARMVCCThen, VCCR:$pred, MQPR:$inactive))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm), + MQPR:$inactive)), + (v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm, + ARMVCCThen, VCCR:$pred, MQPR:$inactive))>; } class MVE_VMINMAXA<string iname, string suffix, bits<2> size, @@ -2291,13 +2518,37 @@ class MVE_VMINMAXA<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>; -def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>; -def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>; +multiclass MVE_VMINMAXA_m<string iname, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int, bit bit_12> { + def "" : MVE_VMINMAXA<iname, VTI.Suffix, VTI.Size, bit_12>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated v(min|max)a + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (abs (VTI.Vec MQPR:$Qm)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>; + + // Predicated v(min|max)a + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + } +} + +multiclass MVE_VMINA<MVEVectorVTInfo VTI> + : MVE_VMINMAXA_m<"vmina", VTI, umin, int_arm_mve_vmina_predicated, 0b1>; + +defm MVE_VMINAs8 : MVE_VMINA<MVE_v16s8>; +defm MVE_VMINAs16 : MVE_VMINA<MVE_v8s16>; +defm MVE_VMINAs32 : MVE_VMINA<MVE_v4s32>; -def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>; -def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>; -def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>; +multiclass MVE_VMAXA<MVEVectorVTInfo VTI> + : MVE_VMINMAXA_m<"vmaxa", VTI, umax, int_arm_mve_vmaxa_predicated, 0b0>; + +defm MVE_VMAXAs8 : MVE_VMAXA<MVE_v16s8>; +defm MVE_VMAXAs16 : MVE_VMAXA<MVE_v8s16>; +defm MVE_VMAXAs32 : MVE_VMAXA<MVE_v4s32>; // end of MVE Integer instructions @@ -2334,7 +2585,7 @@ class MVE_shift_imm<dag oops, dag iops, string iname, string suffix, let Inst{3-1} = Qm{2-0}; } -class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, +class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, bit top, list<dag> 
pattern=[]> : MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm), iname, suffix, "$Qd, $Qm", vpred_r, "", @@ -2344,25 +2595,36 @@ class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, let Inst{21} = 0b1; let Inst{20-19} = sz{1-0}; let Inst{18-16} = 0b000; + let Inst{12} = top; let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = 0b0; + let doubleWidthResult = 1; } -multiclass MVE_VMOVL_shift_half<string iname, string suffix, bits<2> sz, bit U, - list<dag> pattern=[]> { - def bh : MVE_VMOVL<!strconcat(iname, "b"), suffix, sz, U, pattern> { - let Inst{12} = 0b0; - } - def th : MVE_VMOVL<!strconcat(iname, "t"), suffix, sz, U, pattern> { - let Inst{12} = 0b1; - } +multiclass MVE_VMOVL_m<bit top, string chr, MVEVectorVTInfo OutVTI, + MVEVectorVTInfo InVTI> { + def "": MVE_VMOVL<"vmovl" # chr, InVTI.Suffix, OutVTI.Size, + InVTI.Unsigned, top>; + defvar Inst = !cast<Instruction>(NAME); + + def : Pat<(OutVTI.Vec (int_arm_mve_vmovl_predicated (InVTI.Vec MQPR:$src), + (i32 InVTI.Unsigned), (i32 top), + (OutVTI.Pred VCCR:$pred), + (OutVTI.Vec MQPR:$inactive))), + (OutVTI.Vec (Inst (InVTI.Vec MQPR:$src), ARMVCCThen, + (OutVTI.Pred VCCR:$pred), + (OutVTI.Vec MQPR:$inactive)))>; } -defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>; -defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>; -defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>; -defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>; +defm MVE_VMOVLs8bh : MVE_VMOVL_m<0, "b", MVE_v8s16, MVE_v16s8>; +defm MVE_VMOVLs8th : MVE_VMOVL_m<1, "t", MVE_v8s16, MVE_v16s8>; +defm MVE_VMOVLu8bh : MVE_VMOVL_m<0, "b", MVE_v8u16, MVE_v16u8>; +defm MVE_VMOVLu8th : MVE_VMOVL_m<1, "t", MVE_v8u16, MVE_v16u8>; +defm MVE_VMOVLs16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8s16>; +defm MVE_VMOVLs16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8s16>; +defm MVE_VMOVLu16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8u16>; +defm MVE_VMOVLu16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8u16>; let Predicates = [HasMVEInt] in { def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16), @@ -2372,12 +2634,23 @@ let Predicates = [HasMVEInt] in { def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8), (MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>; + def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8), + (MVE_VMOVLs8th MQPR:$src)>; + def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16), + (MVE_VMOVLs16th MQPR:$src)>; + + // zext_inreg 8 -> 16 + def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)), + (MVE_VMOVLu8bh MQPR:$src)>; // zext_inreg 16 -> 32 def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))), (MVE_VMOVLu16bh MQPR:$src)>; - // zext_inreg 8 -> 16 - def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))), - (MVE_VMOVLu8bh MQPR:$src)>; + // Same zext_inreg with vrevs, picking the top half + def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)), + (MVE_VMOVLu8th MQPR:$src)>; + def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), + (v4i32 (ARMvmovImm (i32 0xCFF)))), + (MVE_VMOVLu16th MQPR:$src)>; } @@ -2395,6 +2668,8 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th, // For the MVE_VSHLL_patterns multiclass to refer to Operand immediateType = immtype; + + let doubleWidthResult = 1; } // The immediate VSHLL instructions accept shift counts from 1 up to @@ -2438,6 +2713,7 @@ class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size, let Inst{11-6} = 0b111000; 
let Inst{4} = 0b0; let Inst{0} = 0b1; + let doubleWidthResult = 1; } multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U, @@ -2472,17 +2748,17 @@ multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> { def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm, (i32 VTI.Unsigned), (i32 top), - (VTI.Pred VCCR:$mask), + (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive))), (VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm, - ARMVCCThen, (VTI.Pred VCCR:$mask), + ARMVCCThen, (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))>; def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits), (i32 VTI.Unsigned), (i32 top), - (VTI.Pred VCCR:$mask), + (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive))), (VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen, - (VTI.Pred VCCR:$mask), + (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))>; } @@ -2509,6 +2785,8 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28, let Inst{11-6} = 0b111111; let Inst{4} = 0b0; let Inst{0} = 0b1; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> { @@ -2550,6 +2828,8 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12, let Inst{11-6} = 0b111111; let Inst{4} = 0b0; let Inst{0} = 0b0; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN< @@ -2598,6 +2878,8 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12, let Inst{11-6} = 0b111101; let Inst{4} = 0b0; let Inst{0} = bit_0; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> { @@ -3131,41 +3413,34 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size, } -multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> { - def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>; - def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>; - def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>; - def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>; - def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>; - def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>; -} +multiclass MVE_VRINT_m<MVEVectorVTInfo VTI, string suffix, bits<3> opcode, + SDNode unpred_op> { + def "": MVE_VRINT<suffix, opcode, VTI.Suffix, VTI.Size>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_vrint"#suffix#"_predicated"); -defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>; -defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>; + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen, + (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>; + } +} -let Predicates = [HasMVEFloat] in { - def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (frint (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 
(ftrunc (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>; - def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))), - (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>; - def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))), - (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>; +multiclass MVE_VRINT_ops<MVEVectorVTInfo VTI> { + defm N : MVE_VRINT_m<VTI, "n", 0b000, int_arm_mve_vrintn>; + defm X : MVE_VRINT_m<VTI, "x", 0b001, frint>; + defm A : MVE_VRINT_m<VTI, "a", 0b010, fround>; + defm Z : MVE_VRINT_m<VTI, "z", 0b011, ftrunc>; + defm M : MVE_VRINT_m<VTI, "m", 0b101, ffloor>; + defm P : MVE_VRINT_m<VTI, "p", 0b111, fceil>; } +defm MVE_VRINTf16 : MVE_VRINT_ops<MVE_v8f16>; +defm MVE_VRINTf32 : MVE_VRINT_ops<MVE_v4f32>; + class MVEFloatArithNeon<string iname, string suffix, bit size, dag oops, dag iops, string ops, vpred_ops vpred, string cstr, list<dag> pattern=[]> @@ -3281,29 +3556,40 @@ class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4, let Inst{8} = bit_8; let Inst{7} = Qn{3}; let Inst{4} = bit_4; + let validForTailPredication = 1; } -def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; - -def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; -def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1, - (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; +multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> { + def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b1, 0b0, fms, + (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = int_arm_mve_fma_predicated; + defvar m1 = (VTI.Vec MQPR:$m1); + defvar m2 = (VTI.Vec MQPR:$m2); + defvar add = (VTI.Vec MQPR:$add); + defvar pred = (VTI.Pred VCCR:$pred); -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), - (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>; - def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), - (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>; - def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))), - (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>; - def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))), - (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>; + let Predicates = [HasMVEFloat] in { + if fms then { + def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (fma m1, (fneg m2), add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + } else { + def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>; + def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)), + (Inst $add, $m1, $m2, ARMVCCThen, $pred)>; + } + } } +defm MVE_VFMAf32 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v4f32>; +defm MVE_VFMAf16 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v8f16>; +defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>; +defm MVE_VFMSf16 : 
MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>; + multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI, SDNode unpred_op, Intrinsic pred_int> { def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> { @@ -3423,10 +3709,10 @@ defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>; defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>; class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op, - Operand imm_operand_type, list<dag> pattern=[]> + Operand imm_operand_type> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6), - "$Qd, $Qm, $imm6", vpred_r, "", pattern> { + "$Qd, $Qm, $imm6", vpred_r, "", []> { bits<4> Qd; bits<6> imm6; @@ -3468,14 +3754,43 @@ class MVE_VCVT_fix_f16<string suffix, bit U, bit op> let Inst{20} = 0b1; } -def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>; -def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>; -def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>; -def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>; -def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>; -def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>; -def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>; -def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>; +multiclass MVE_VCVT_fix_patterns<Instruction Inst, bit U, MVEVectorVTInfo DestVTI, + MVEVectorVTInfo SrcVTI> { + let Predicates = [HasMVEFloat] in { + def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix + (i32 U), (SrcVTI.Vec MQPR:$Qm), imm:$scale)), + (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale))>; + def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix_predicated (i32 U), + (DestVTI.Vec MQPR:$inactive), + (SrcVTI.Vec MQPR:$Qm), + imm:$scale, + (DestVTI.Pred VCCR:$mask))), + (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale, + ARMVCCThen, (DestVTI.Pred VCCR:$mask), + (DestVTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VCVT_fix_f32_m<bit U, bit op, + MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> { + def "" : MVE_VCVT_fix_f32<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>; + defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>; +} + +multiclass MVE_VCVT_fix_f16_m<bit U, bit op, + MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> { + def "" : MVE_VCVT_fix_f16<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>; + defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>; +} + +defm MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16_m<0b0, 0b0, MVE_v8f16, MVE_v8s16>; +defm MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16_m<0b0, 0b1, MVE_v8s16, MVE_v8f16>; +defm MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16_m<0b1, 0b0, MVE_v8f16, MVE_v8u16>; +defm MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16_m<0b1, 0b1, MVE_v8u16, MVE_v8f16>; +defm MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32_m<0b0, 0b0, MVE_v4f32, MVE_v4s32>; +defm MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32_m<0b0, 0b1, MVE_v4s32, MVE_v4f32>; +defm MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32_m<0b1, 0b0, MVE_v4f32, MVE_v4u32>; +defm MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32_m<0b1, 0b1, MVE_v4u32, MVE_v4f32>; class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm, bits<2> rm, list<dag> pattern=[]> @@ -3497,23 +3812,44 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm, let validForTailPredication = 1; } -multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op, - list<dag> pattern=[]> { - def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>; - def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>; - def p : 
MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>; - def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>; +multiclass MVE_VCVT_fp_int_anpm_inner<MVEVectorVTInfo Int, MVEVectorVTInfo Flt, + string anpm, bits<2> rm> { + def "": MVE_VCVT_fp_int_anpm<Int.Suffix # "." # Flt.Suffix, Int.Size, + Int.Unsigned, anpm, rm>; + + defvar Inst = !cast<Instruction>(NAME); + defvar IntrBaseName = "int_arm_mve_vcvt" # anpm; + defvar UnpredIntr = !cast<Intrinsic>(IntrBaseName); + defvar PredIntr = !cast<Intrinsic>(IntrBaseName # "_predicated"); + + let Predicates = [HasMVEFloat] in { + def : Pat<(Int.Vec (UnpredIntr (i32 Int.Unsigned), (Flt.Vec MQPR:$in))), + (Int.Vec (Inst (Flt.Vec MQPR:$in)))>; + + def : Pat<(Int.Vec (PredIntr (i32 Int.Unsigned), (Int.Vec MQPR:$inactive), + (Flt.Vec MQPR:$in), (Flt.Pred VCCR:$pred))), + (Int.Vec (Inst (Flt.Vec MQPR:$in), ARMVCCThen, + (Flt.Pred VCCR:$pred), (Int.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VCVT_fp_int_anpm_outer<MVEVectorVTInfo Int, + MVEVectorVTInfo Flt> { + defm a : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "a", 0b00>; + defm n : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "n", 0b01>; + defm p : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "p", 0b10>; + defm m : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "m", 0b11>; } // This defines instructions such as MVE_VCVTu16f16a, with an explicit // rounding-mode suffix on the mnemonic. The class below will define // the bare MVE_VCVTu16f16 (with implied rounding toward zero). -defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>; -defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>; -defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>; -defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>; +defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8s16, MVE_v8f16>; +defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8u16, MVE_v8f16>; +defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4s32, MVE_v4f32>; +defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4u32, MVE_v4f32>; -class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op, +class MVE_VCVT_fp_int<string suffix, bits<2> size, bit toint, bit unsigned, list<dag> pattern=[]> : MVE_float<"vcvt", suffix, (outs MQPR:$Qd), (ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> { @@ -3527,41 +3863,43 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op, let Inst{17-16} = 0b11; let Inst{15-13} = Qd{2-0}; let Inst{12-9} = 0b0011; - let Inst{8-7} = op; + let Inst{8} = toint; + let Inst{7} = unsigned; let Inst{4} = 0b0; let validForTailPredication = 1; } +multiclass MVE_VCVT_fp_int_m<MVEVectorVTInfo Dest, MVEVectorVTInfo Src, + SDNode unpred_op> { + defvar Unsigned = !or(!eq(Dest.SuffixLetter,"u"), !eq(Src.SuffixLetter,"u")); + defvar ToInt = !eq(Src.SuffixLetter,"f"); + + def "" : MVE_VCVT_fp_int<Dest.Suffix # "." 
# Src.Suffix, Dest.Size, + ToInt, Unsigned>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(Dest.Vec (unpred_op (Src.Vec MQPR:$src))), + (Dest.Vec (Inst (Src.Vec MQPR:$src)))>; + def : Pat<(Dest.Vec (int_arm_mve_vcvt_fp_int_predicated + (Src.Vec MQPR:$src), (i32 Unsigned), + (Src.Pred VCCR:$mask), (Dest.Vec MQPR:$inactive))), + (Dest.Vec (Inst (Src.Vec MQPR:$src), ARMVCCThen, + (Src.Pred VCCR:$mask), + (Dest.Vec MQPR:$inactive)))>; + } +} // The unsuffixed VCVT for float->int implicitly rounds toward zero, // which I reflect here in the llvm instruction names -def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>; -def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>; -def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>; -def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>; +defm MVE_VCVTs16f16z : MVE_VCVT_fp_int_m<MVE_v8s16, MVE_v8f16, fp_to_sint>; +defm MVE_VCVTu16f16z : MVE_VCVT_fp_int_m<MVE_v8u16, MVE_v8f16, fp_to_uint>; +defm MVE_VCVTs32f32z : MVE_VCVT_fp_int_m<MVE_v4s32, MVE_v4f32, fp_to_sint>; +defm MVE_VCVTu32f32z : MVE_VCVT_fp_int_m<MVE_v4u32, MVE_v4f32, fp_to_uint>; // Whereas VCVT for int->float rounds to nearest -def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>; -def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>; -def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>; -def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>; - -let Predicates = [HasMVEFloat] in { - def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))), - (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>; - def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))), - (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>; - def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))), - (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>; - def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))), - (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>; - def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))), - (v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>; - def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))), - (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>; - def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))), - (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>; - def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))), - (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>; -} +defm MVE_VCVTf16s16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8s16, sint_to_fp>; +defm MVE_VCVTf16u16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8u16, uint_to_fp>; +defm MVE_VCVTf32s32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4s32, sint_to_fp>; +defm MVE_VCVTf32u32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4u32, uint_to_fp>; class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate, list<dag> pattern=[]> @@ -3582,26 +3920,29 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate, let validForTailPredication = 1; } -def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>; -def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>; - -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fabs MQPR:$src)), - (MVE_VABSf16 MQPR:$src)>; - def : Pat<(v4f32 (fabs MQPR:$src)), - (MVE_VABSf32 MQPR:$src)>; -} +multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int, + MVEVectorVTInfo VTI, bit opcode> { + def "" : MVE_VABSNEG_fp<iname, VTI.Suffix, VTI.Size, opcode>; + defvar Inst = !cast<Instruction>(NAME); -def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>; -def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>; + let Predicates = [HasMVEInt] in { + def : 
Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>; -let Predicates = [HasMVEFloat] in { - def : Pat<(v8f16 (fneg MQPR:$src)), - (MVE_VNEGf16 MQPR:$src)>; - def : Pat<(v4f32 (fneg MQPR:$src)), - (MVE_VNEGf32 MQPR:$src)>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive))), + (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>; + } } +defm MVE_VABSf16 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated, + MVE_v8f16, 0>; +defm MVE_VABSf32 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated, + MVE_v4f32, 0>; +defm MVE_VNEGf16 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated, + MVE_v8f16, 1>; +defm MVE_VNEGf32 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated, + MVE_v4f32, 1>; + class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12, list<dag> pattern=[]> : MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), @@ -3623,11 +3964,37 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12, let Inst{0} = 0b1; } -def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>; -def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>; +multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI, + SDNode unpred_op, Intrinsic pred_int, + bit bit_12> { + def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size{0}, bit_12>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated v(max|min)nma + def : Pat<(VTI.Vec (unpred_op (fabs (VTI.Vec MQPR:$Qd)), + (fabs (VTI.Vec MQPR:$Qm)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>; + + // Predicated v(max|min)nma + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm), + ARMVCCThen, (VTI.Pred VCCR:$mask)))>; + } +} + +multiclass MVE_VMAXNMA<MVEVectorVTInfo VTI, bit bit_12> + : MVE_VMAXMINNMA_m<"vmaxnma", VTI, fmaxnum, int_arm_mve_vmaxnma_predicated, bit_12>; + +defm MVE_VMAXNMAf32 : MVE_VMAXNMA<MVE_v4f32, 0b0>; +defm MVE_VMAXNMAf16 : MVE_VMAXNMA<MVE_v8f16, 0b0>; -def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>; -def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>; +multiclass MVE_VMINNMA<MVEVectorVTInfo VTI, bit bit_12> + : MVE_VMAXMINNMA_m<"vminnma", VTI, fminnum, int_arm_mve_vminnma_predicated, bit_12>; + +defm MVE_VMINNMAf32 : MVE_VMINNMA<MVE_v4f32, 0b1>; +defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>; // end of MVE Floating Point instructions @@ -3796,12 +4163,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> { def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; - def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 
(ARMvdup rGPR:$v2)), fc)), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>; def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))), (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; @@ -3810,12 +4177,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> { def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))), (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))), - (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))), - (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))), - (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))), + (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))), + (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))), + (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; } multiclass unpred_vcmpf_z<PatLeaf fc> { @@ -3825,31 +4192,31 @@ multiclass unpred_vcmpf_z<PatLeaf fc> { (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))), - (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; } multiclass unpred_vcmpf_r<int fc> { - def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), - (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; - def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), - (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; + def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; + def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; - def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; - def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)), - (v4i1 (MVE_VCMPf32r (v4f32 
MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; + def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; } let Predicates = [HasMVEInt] in { @@ -3889,7 +4256,7 @@ let Predicates = [HasMVEFloat] in { } -// Extra "worst case" and/or/xor partterns, going into and out of GRP +// Extra "worst case" and/or/xor patterns, going into and out of GRP multiclass two_predops<SDPatternOperator opnode, Instruction insn> { def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))), (v16i1 (COPY_TO_REGCLASS @@ -3918,7 +4285,6 @@ let Predicates = [HasMVEInt] in { // example when moving between rGPR and VPR.P0 as part of predicate vector // shuffles. We also sometimes need to cast between different predicate // vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles. - def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>; let Predicates = [HasMVEInt] in { @@ -3932,6 +4298,16 @@ let Predicates = [HasMVEInt] in { def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } + + // Here we match the specific SDNode type 'ARMVectorRegCastImpl' + // rather than the more general 'ARMVectorRegCast' which would also + // match some bitconverts. If we use the latter in cases where the + // input and output types are the same, the bitconvert gets elided + // and we end up generating a nonsense match of nothing. + + foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in + foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in + def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), (VT MQPR:$src)>; } // end of MVE compares @@ -3973,11 +4349,32 @@ class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract, let Inst{0} = round; } +multiclass MVE_VQxDMLxDH_p<string iname, bit exch, bit round, bit subtract, + MVEVectorVTInfo VTI> { + def "": MVE_VQxDMLxDH<iname, exch, round, subtract, VTI.Suffix, VTI.Size, + !if(!eq(VTI.LaneBits, 32), ",@earlyclobber $Qd", "")>; + defvar Inst = !cast<Instruction>(NAME); + defvar ConstParams = (? 
(i32 exch), (i32 round), (i32 subtract)); + defvar unpred_intr = int_arm_mve_vqdmlad; + defvar pred_intr = int_arm_mve_vqdmlad_predicated; + + def : Pat<(VTI.Vec !con((unpred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)), ConstParams)), + (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)))>; + def : Pat<(VTI.Vec !con((pred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c)), ConstParams, + (? (VTI.Pred VCCR:$pred)))), + (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b), + (VTI.Vec MQPR:$c), + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; +} + multiclass MVE_VQxDMLxDH_multi<string iname, bit exch, bit round, bit subtract> { - def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>; - def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>; - def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">; + defm s8 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v16s8>; + defm s16 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v8s16>; + defm s32 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v4s32>; } defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>; @@ -4051,6 +4448,7 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20, let Inst{7} = Qn{3}; let Inst{0} = 0b0; let validForTailPredication = 1; + let doubleWidthResult = 1; } multiclass MVE_VMULL_m<MVEVectorVTInfo VTI, @@ -4072,10 +4470,10 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI, // Predicated multiply def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), - uflag, (? (i32 Top), (VTI.Pred VCCR:$mask), + uflag, (? (i32 Top), (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))), (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - ARMVCCThen, (VTI.Pred VCCR:$mask), + ARMVCCThen, (VTI.DblPred VCCR:$mask), (VTI.DblVec MQPR:$inactive)))>; } } @@ -4122,6 +4520,50 @@ defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly, defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly, int_arm_mve_mull_poly_predicated, 0b1>; +let Predicates = [HasMVEInt] in { + def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))), + (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>; + def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))), + (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))), + (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16), + (sext_inreg (v4i32 MQPR:$src2), v4i16)), + (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16), + (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)), + (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8), + (sext_inreg (v8i16 MQPR:$src2), v8i8)), + (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8), + (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)), + (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))), + (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>; + def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))), + (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))), + (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))), + (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))), + 
(MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), + (v4i32 (ARMvmovImm (i32 0xCFF)))), + (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), + (v4i32 (ARMvmovImm (i32 0xCFF))))), + (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>; + + def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)), + (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))), + (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>; + def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)), + (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))), + (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>; +} + class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round, list<dag> pattern=[]> : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), @@ -4195,6 +4637,8 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17, let Inst{8} = 0b0; let Inst{7} = !if(!eq(bit_17, 0), 1, 0); let Inst{0} = 0b1; + let validForTailPredication = 1; + let retainsPreviousHalfElement = 1; } multiclass MVE_VxMOVxN_halves<string iname, string suffix, @@ -4213,21 +4657,121 @@ defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>; defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>; def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>; + +multiclass MVE_VMOVN_p<Instruction Inst, bit top, + MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> { + // Match the most obvious MVEvmovn(a,b,t), which overwrites the odd or even + // lanes of a (depending on t) with the even lanes of b. + def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qd_src), + (VTI.Vec MQPR:$Qm), (i32 top))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>; + + if !eq(top, 0) then { + // If we see MVEvmovn(a,ARMvrev(b),1), that wants to overwrite the odd + // lanes of a with the odd lanes of b. In other words, the lanes we're + // _keeping_ from a are the even ones. So we can flip it round and say that + // this is the same as overwriting the even lanes of b with the even lanes + // of a, i.e. it's a VMOVNB with the operands reversed. + defvar vrev = !cast<SDNode>("ARMvrev" # InVTI.LaneBits); + def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qm), + (VTI.Vec (vrev MQPR:$Qd_src)), (i32 1))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>; + } + + // Match the IR intrinsic for a predicated VMOVN. This regards the Qm input + // as having wider lanes that we're narrowing, instead of already-narrow + // lanes that we're taking every other one of. 
+ def : Pat<(VTI.Vec (int_arm_mve_vmovn_predicated (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), (i32 top), + (InVTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + ARMVCCThen, (InVTI.Pred VCCR:$pred)))>; +} + +defm : MVE_VMOVN_p<MVE_VMOVNi32bh, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VMOVN_p<MVE_VMOVNi32th, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VMOVN_p<MVE_VMOVNi16bh, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VMOVN_p<MVE_VMOVNi16th, 1, MVE_v16i8, MVE_v8i16>; + +multiclass MVE_VQMOVN_p<Instruction Inst, bit outU, bit inU, bit top, + MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> { + def : Pat<(VTI.Vec (int_arm_mve_vqmovn (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + (i32 outU), (i32 inU), (i32 top))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm)))>; + + def : Pat<(VTI.Vec (int_arm_mve_vqmovn_predicated (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + (i32 outU), (i32 inU), (i32 top), + (InVTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), + (InVTI.Vec MQPR:$Qm), + ARMVCCThen, (InVTI.Pred VCCR:$pred)))>; +} + +defm : MVE_VQMOVN_p<MVE_VQMOVNs32bh, 0, 0, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNs32th, 0, 0, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNs16bh, 0, 0, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVNs16th, 0, 0, 1, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu32bh, 1, 1, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu32th, 1, 1, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu16bh, 1, 1, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVNu16th, 1, 1, 1, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs32bh, 1, 0, 0, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs32th, 1, 0, 1, MVE_v8i16, MVE_v4i32>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs16bh, 1, 0, 0, MVE_v16i8, MVE_v8i16>; +defm : MVE_VQMOVN_p<MVE_VQMOVUNs16th, 1, 0, 1, MVE_v16i8, MVE_v8i16>; + +def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVec<2>, SDTCisVT<3, i32>]>; +def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>; +def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>; + let Predicates = [HasMVEInt] in { - def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), - (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; - def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), - (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; - def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))), - (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; - def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))), - (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VQMOVNs32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VQMOVNs32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VQMOVNs16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VQMOVNs16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))), + (v8i16 (MVE_VQMOVNu32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; 
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))), + (v8i16 (MVE_VQMOVNu32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))), + (v16i8 (MVE_VQMOVNu16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))), + (v16i8 (MVE_VQMOVNu16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>; + + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))), + (v8i16 (MVE_VQSHRNbhs32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))), + (v16i8 (MVE_VQSHRNbhs16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; + def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))), + (v8i16 (MVE_VQSHRNths32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))), + (v16i8 (MVE_VQSHRNths16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; + + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))), + (v8i16 (MVE_VQSHRNbhu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))), + (v16i8 (MVE_VQSHRNbhu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; + def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))), + (v8i16 (MVE_VQSHRNthu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>; + def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))), + (v16i8 (MVE_VQSHRNthu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>; } class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, - list<dag> pattern=[]> - : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm), - "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> { + dag iops_extra, vpred_ops vpred, string cstr> + : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), + !con(iops_extra, (ins MQPR:$Qm)), "$Qd, $Qm", + vpred, cstr, []> { let Inst{28} = op; let Inst{21-16} = 0b111111; let Inst{12} = T; @@ -4235,10 +4779,17 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T, let Inst{0} = 0b1; let Predicates = [HasMVEFloat]; + let retainsPreviousHalfElement = 1; } +def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisVT<2, i32>]>; +def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>; +def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>; + multiclass MVE_VCVT_f2h_m<string iname, int half> { - def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>; + def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half, + (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">; defvar Inst = !cast<Instruction>(NAME); let Predicates = [HasMVEFloat] in { @@ -4250,11 +4801,28 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> { (v4i1 VCCR:$mask))), (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), ARMVCCThen, (v4i1 VCCR:$mask)))>; + + def : Pat<(v8f16 (MVEvcvtn (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))), + (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>; } } multiclass MVE_VCVT_h2f_m<string iname, int 
half> { - def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>; + def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))), + (v4f32 (Inst (v8f16 MQPR:$Qm)))>; + def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated + (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half), + (v4i1 VCCR:$mask))), + (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen, + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>; + + def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))), + (v4f32 (Inst (v8f16 MQPR:$Qm)))>; + } } defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>; @@ -4353,15 +4921,37 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T, let Inst{7} = Qn{3}; let Inst{0} = 0b1; let validForTailPredication = 1; + let doubleWidthResult = 1; +} + +multiclass MVE_VQDMULL_m<string iname, MVEVectorVTInfo VTI, bit size, bit T, + string cstr> { + def "" : MVE_VQDMULL<iname, VTI.Suffix, size, T, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm), + (VTI.Vec MQPR:$Qn), (i32 T))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + // Predicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (i32 T), (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; + } } -multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> { - def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>; - def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>; +multiclass MVE_VQDMULL_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> { + defm bh : MVE_VQDMULL_m<"vqdmullb", VTI, size, 0b0, cstr>; + defm th : MVE_VQDMULL_m<"vqdmullt", VTI, size, 0b1, cstr>; } -defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>; -defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">; +defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<MVE_v8s16, 0b0>; +defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">; // end of mve_qDest_qSrc @@ -4407,10 +4997,61 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]> let Inst{3-0} = Rm{3-0}; } +// Patterns for vector-scalar instructions with integer operands +multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI, + SDNode unpred_op, SDNode pred_op, + bit unpred_has_sign = 0, + bit pred_has_sign = 0> { + defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?)); + defvar PredSign = !if(pred_has_sign, (? 
(i32 VTI.Unsigned)), (?)); + + let Predicates = [HasMVEInt] in { + // Unpredicated version + def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val))), + UnpredSign)), + (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>; + // Predicated version + def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val))), + PredSign, + (pred_op (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))), + (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +// Patterns for vector-scalar instructions with FP operands +multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int, + Instruction instr_f16, + Instruction instr_f32> { + let Predicates = [HasMVEFloat] in { + // Unpredicated F16 + def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>; + // Unpredicated F32 + def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>; + // Predicated F16 + def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)), + (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; + // Predicated F32 + def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)), + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + } +} + class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, - bit bit_5, bit bit_12, bit bit_16, - bit bit_28, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, "", pattern> { + bit bit_5, bit bit_12, bit bit_16, bit bit_28> + : MVE_qDest_rSrc<iname, suffix, ""> { let Inst{28} = bit_28; let Inst{21-20} = size; @@ -4421,42 +5062,60 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size, let validForTailPredication = 1; } -multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix, - bit bit_5, bit bit_12, bit bit_16, - bit bit_28, list<dag> pattern=[]> { - def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00, - bit_5, bit_12, bit_16, bit_28>; - def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01, - bit_5, bit_12, bit_16, bit_28>; - def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10, - bit_5, bit_12, bit_16, bit_28>; -} - -defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>; -defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>; -defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>; - -defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>; -defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>; -defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; -} - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 
(MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; -} +// Vector-scalar add/sub +multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, + SDNode unpred_op, Intrinsic pred_int> { + def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + unpred_op, pred_int>; +} + +multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI> + : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>; + +multiclass MVE_VSUB_qr_m<MVEVectorVTInfo VTI> + : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>; + +defm MVE_VADD_qr_i8 : MVE_VADD_qr_m<MVE_v16i8>; +defm MVE_VADD_qr_i16 : MVE_VADD_qr_m<MVE_v8i16>; +defm MVE_VADD_qr_i32 : MVE_VADD_qr_m<MVE_v4i32>; + +defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m<MVE_v16i8>; +defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m<MVE_v8i16>; +defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>; + +// Vector-scalar saturating add/sub +multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, + SDNode unpred_op_s, SDNode unpred_op_u, + Intrinsic pred_int> { + def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract, + 0b0, VTI.Unsigned>; + defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s); + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + unpred_op, pred_int, 0, 1>; +} + +multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI> + : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat, + int_arm_mve_qadd_predicated>; + +multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI> + : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat, + int_arm_mve_qsub_predicated>; + +defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8>; +defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16>; +defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32>; +defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8>; +defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16>; +defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32>; + +defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8>; +defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16>; +defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32>; +defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8>; +defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16>; +defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32>; class MVE_VQDMULL_qr<string iname, string suffix, bit size, bit T, string cstr="", list<dag> pattern=[]> @@ -4469,15 +5128,40 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size, let Inst{8} = 0b1; let Inst{5} = 0b1; let validForTailPredication = 1; + let doubleWidthResult = 1; } -multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> { - def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>; - def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>; +multiclass MVE_VQDMULL_qr_m<string iname, MVEVectorVTInfo VTI, bit size, + bit T, string cstr> { + def "" : MVE_VQDMULL_qr<iname, VTI.Suffix, size, T, cstr>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + // Unpredicated saturating multiply + def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val)), + (i32 T))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>; + // Predicated saturating multiply + def : 
Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated + (VTI.Vec MQPR:$Qm), + (VTI.Vec (ARMvdup rGPR:$val)), + (i32 T), + (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive))), + (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val), + ARMVCCThen, (VTI.DblPred VCCR:$mask), + (VTI.DblVec MQPR:$inactive)))>; + } } -defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>; -defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">; +multiclass MVE_VQDMULL_qr_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> { + defm bh : MVE_VQDMULL_qr_m<"vqdmullb", VTI, size, 0b0, cstr>; + defm th : MVE_VQDMULL_qr_m<"vqdmullt", VTI, size, 0b1, cstr>; +} + +defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<MVE_v8s16, 0b0>; +defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">; class MVE_VxADDSUB_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit subtract, @@ -4493,19 +5177,34 @@ class MVE_VxADDSUB_qr<string iname, string suffix, let validForTailPredication = 1; } -def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>; -def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>; -def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>; -def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>; -def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>; -def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, + Intrinsic unpred_int, Intrinsic pred_int> { + def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), + VTI, unpred_int, pred_int, 1, 1>; +} + +multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, + int_arm_mve_hadd_predicated>; + +multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, + int_arm_mve_hsub_predicated>; -def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>; -def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>; -def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>; -def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>; -def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>; -def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>; +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>; + +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; let Predicates = [HasMVEFloat] in { def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>; @@ -4515,6 +5214,11 @@ let Predicates = [HasMVEFloat] in { def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>; } +defm : MVE_vec_scalar_fp_pat_m<fadd, int_arm_mve_add_predicated, + MVE_VADD_qr_f16, MVE_VADD_qr_f32>; +defm : MVE_vec_scalar_fp_pat_m<fsub, int_arm_mve_sub_predicated, + MVE_VSUB_qr_f16, MVE_VSUB_qr_f32>; 
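These qr-form patterns are what let ordinary vector/scalar arithmetic in C select the register-operand MVE encodings instead of an explicit vdup feeding a vector-vector op. A minimal C sketch using the ACLE MVE intrinsics (the arm_mve.h header and the vaddq_* spellings are ACLE names assumed here, not anything this patch defines), built for an MVE target such as -mcpu=cortex-m55:

#include <arm_mve.h>

/* Unpredicated: matches the (add (vec), (ARMvdup rGPR)) form handled by
   MVE_vec_scalar_int_pat_m, so it can select "vadd.i32 q0, q0, r0". */
int32x4_t add_scalar(int32x4_t v, int32_t s) {
  return vaddq_n_s32(v, s);
}

/* Predicated: reaches int_arm_mve_add_predicated; lanes masked off by p
   take their value from `inactive`, as in the predicated patterns above. */
int32x4_t add_scalar_m(int32x4_t inactive, int32x4_t v, int32_t s,
                       mve_pred16_t p) {
  return vaddq_m_n_s32(inactive, v, s, p);
}

The same multiclass plumbing should give vsub, vqadd, vqsub and the float vadd/vsub variants the analogous behaviour from their *_n_* intrinsic counterparts.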
+ class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size, bit bit_7, bit bit_17, list<dag> pattern=[]> : MVE_qDest_single_rSrc<iname, suffix, pattern> { @@ -4563,19 +5267,19 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), - (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), - (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), - (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))), + (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))), + (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))), + (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>; - def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), - (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), - (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), - (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))), + (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))), + (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))), + (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>; } class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]> @@ -4594,6 +5298,20 @@ def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>; def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>; def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>; +multiclass MVE_VBRSR_pat_m<MVEVectorVTInfo VTI, Instruction Inst> { + // Unpredicated + def : Pat<(VTI.Vec (int_arm_mve_vbrsr (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm)))>; + // Predicated + def : Pat<(VTI.Vec (int_arm_mve_vbrsr_predicated + (VTI.Vec MQPR:$inactive), + (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm), + (VTI.Pred VCCR:$mask))), + (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; +} + let Predicates = [HasMVEInt] in { def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))), (v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>; @@ -4603,11 +5321,19 @@ let Predicates = [HasMVEInt] in { def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))), (v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>; + + defm : MVE_VBRSR_pat_m<MVE_v16i8, MVE_VBRSR8>; + defm : MVE_VBRSR_pat_m<MVE_v8i16, MVE_VBRSR16>; + defm : MVE_VBRSR_pat_m<MVE_v4i32, MVE_VBRSR32>; } -class MVE_VMUL_qr_int<string iname, string suffix, - bits<2> size, list<dag> pattern=[]> - : MVE_qDest_rSrc<iname, suffix, "", pattern> { +let Predicates = [HasMVEFloat] in { + defm : MVE_VBRSR_pat_m<MVE_v8f16, MVE_VBRSR16>; + defm : MVE_VBRSR_pat_m<MVE_v4f32, MVE_VBRSR32>; +} + +class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size> + : MVE_qDest_rSrc<iname, suffix, ""> { let 
Inst{28} = 0b0; let Inst{21-20} = size; @@ -4618,19 +5344,16 @@ class MVE_VMUL_qr_int<string iname, string suffix, let validForTailPredication = 1; } -def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>; -def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>; -def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>; - -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))), - (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))), - (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>; - def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))), - (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>; +multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> { + def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + mul, int_arm_mve_mul_predicated>; } +defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>; +defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m<MVE_v8i16>; +defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m<MVE_v4i32>; + class MVE_VxxMUL_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]> : MVE_qDest_rSrc<iname, suffix, "", pattern> { @@ -4643,19 +5366,37 @@ class MVE_VxxMUL_qr<string iname, string suffix, let Inst{5} = 0b1; } -def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>; -def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>; -def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>; +multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28, + Intrinsic int_unpred, Intrinsic int_pred> { + def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>; + defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, + int_unpred, int_pred>; +} + +multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> : + MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0, + int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>; + +multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> : + MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1, + int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>; -def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>; -def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>; -def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>; +defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>; +defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m<MVE_v8s16>; +defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m<MVE_v4s32>; + +defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>; +defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>; +defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>; let Predicates = [HasMVEFloat], validForTailPredication = 1 in { def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>; def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>; } +defm : MVE_vec_scalar_fp_pat_m<fmul, int_arm_mve_mul_predicated, + MVE_VMUL_qr_f16, MVE_VMUL_qr_f32>; + class MVE_VFMAMLA_qr<string iname, string suffix, bit bit_28, bits<2> bits_21_20, bit S, list<dag> pattern=[]> @@ -4668,42 +5409,87 @@ class MVE_VFMAMLA_qr<string iname, string suffix, let Inst{8} = 0b0; let Inst{5} = 0b0; let validForTailPredication = 1; + let hasSideEffects = 0; } -def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>; -def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>; -def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>; -def 
MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>; -def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>; -def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI, + bit scalar_addend> { + def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, + scalar_addend>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_n_predicated"); + defvar v1 = (VTI.Vec MQPR:$v1); + defvar v2 = (VTI.Vec MQPR:$v2); + defvar vs = (VTI.Vec (ARMvdup rGPR:$s)); + defvar s = (i32 rGPR:$s); + defvar pred = (VTI.Pred VCCR:$pred); + + // The signed and unsigned variants of this instruction have different + // encodings, but they're functionally identical. For the sake of + // determinism, we generate only the unsigned variant. + if VTI.Unsigned then let Predicates = [HasMVEInt] in { + if scalar_addend then { + def : Pat<(VTI.Vec (add (mul v1, v2), vs)), + (VTI.Vec (Inst v1, v2, s))>; + } else { + def : Pat<(VTI.Vec (add (mul v2, vs), v1)), + (VTI.Vec (Inst v1, v2, s))>; + } -def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>; -def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>; -def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>; -def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>; -def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>; -def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>; + def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)), + (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>; + } +} -let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (add (v4i32 MQPR:$src1), - (v4i32 (mul (v4i32 MQPR:$src2), - (v4i32 (ARMvdup (i32 rGPR:$x))))))), - (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>; - def : Pat<(v8i16 (add (v8i16 MQPR:$src1), - (v8i16 (mul (v8i16 MQPR:$src2), - (v8i16 (ARMvdup (i32 rGPR:$x))))))), - (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>; - def : Pat<(v16i8 (add (v16i8 MQPR:$src1), - (v16i8 (mul (v16i8 MQPR:$src2), - (v16i8 (ARMvdup (i32 rGPR:$x))))))), - (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>; +defm MVE_VMLA_qr_s8 : MVE_VMLA_qr_multi<"vmla", MVE_v16s8, 0b0>; +defm MVE_VMLA_qr_s16 : MVE_VMLA_qr_multi<"vmla", MVE_v8s16, 0b0>; +defm MVE_VMLA_qr_s32 : MVE_VMLA_qr_multi<"vmla", MVE_v4s32, 0b0>; +defm MVE_VMLA_qr_u8 : MVE_VMLA_qr_multi<"vmla", MVE_v16u8, 0b0>; +defm MVE_VMLA_qr_u16 : MVE_VMLA_qr_multi<"vmla", MVE_v8u16, 0b0>; +defm MVE_VMLA_qr_u32 : MVE_VMLA_qr_multi<"vmla", MVE_v4u32, 0b0>; + +defm MVE_VMLAS_qr_s8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16s8, 0b1>; +defm MVE_VMLAS_qr_s16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8s16, 0b1>; +defm MVE_VMLAS_qr_s32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4s32, 0b1>; +defm MVE_VMLAS_qr_u8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16u8, 0b1>; +defm MVE_VMLAS_qr_u16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8u16, 0b1>; +defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>; + +multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI, + bit scalar_addend> { + def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>; + defvar Inst = !cast<Instruction>(NAME); + defvar pred_int = int_arm_mve_fma_predicated; + defvar v1 = (VTI.Vec MQPR:$v1); + defvar v2 = (VTI.Vec MQPR:$v2); + defvar vs = (VTI.Vec (ARMvdup (i32 rGPR:$s))); + defvar is = (i32 rGPR:$s); + defvar pred = (VTI.Pred VCCR:$pred); + + let Predicates = [HasMVEFloat] in { + if scalar_addend then { + def : Pat<(VTI.Vec (fma v1, v2, vs)), + 
(VTI.Vec (Inst v1, v2, is))>; + def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)), + (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>; + } else { + def : Pat<(VTI.Vec (fma v1, vs, v2)), + (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (fma vs, v1, v2)), + (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)), + (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>; + def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)), + (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>; + } + } } let Predicates = [HasMVEFloat] in { - def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>; - def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>; - def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>; - def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>; + defm MVE_VFMA_qr_f16 : MVE_VFMA_qr_multi<"vfma", MVE_v8f16, 0>; + defm MVE_VFMA_qr_f32 : MVE_VFMA_qr_multi<"vfma", MVE_v4f32, 0>; + defm MVE_VFMA_qr_Sf16 : MVE_VFMA_qr_multi<"vfmas", MVE_v8f16, 1>; + defm MVE_VFMA_qr_Sf32 : MVE_VFMA_qr_multi<"vfmas", MVE_v4f32, 1>; } class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size, @@ -4718,10 +5504,30 @@ class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size, let Inst{5} = bit_5; } +multiclass MVE_VQDMLAH_qr_multi<string iname, MVEVectorVTInfo VTI, + bit bit_5, bit bit_12> { + def "": MVE_VQDMLAH_qr<iname, VTI.Suffix, 0b0, VTI.Size, bit_5, bit_12>; + defvar Inst = !cast<Instruction>(NAME); + defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # iname); + defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_predicated"); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s))), + (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s), (VTI.Pred VCCR:$pred))), + (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2), + (i32 rGPR:$s), ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + } +} + multiclass MVE_VQDMLAH_qr_types<string iname, bit bit_5, bit bit_12> { - def s8 : MVE_VQDMLAH_qr<iname, "s8", 0b0, 0b00, bit_5, bit_12>; - def s16 : MVE_VQDMLAH_qr<iname, "s16", 0b0, 0b01, bit_5, bit_12>; - def s32 : MVE_VQDMLAH_qr<iname, "s32", 0b0, 0b10, bit_5, bit_12>; + defm s8 : MVE_VQDMLAH_qr_multi<iname, MVE_v16s8, bit_5, bit_12>; + defm s16 : MVE_VQDMLAH_qr_multi<iname, MVE_v8s16, bit_5, bit_12>; + defm s32 : MVE_VQDMLAH_qr_multi<iname, MVE_v4s32, bit_5, bit_12>; } defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>; @@ -4752,6 +5558,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{6-1} = 0b110111; let Inst{0} = imm{0}; let validForTailPredication = 1; + let hasSideEffects = 0; } def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>; @@ -4787,6 +5594,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12, let Inst{3-1} = Rm{3-1}; let Inst{0} = imm{0}; let validForTailPredication = 1; + let hasSideEffects = 0; } def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>; @@ -4855,6 +5663,8 @@ class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr> let Inst{12-5} = 0b01111000; let Inst{4} = idx2; let Inst{3-0} = Rt{3-0}; + + let hasSideEffects = 0; } // The assembly syntax for these instructions mentions the vector @@ -4924,6 +5734,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size, let mayLoad = load; let mayStore = 
!eq(load,0); + let hasSideEffects = 0; } // A parameter class used to encapsulate all the ways the writeback @@ -5004,22 +5815,44 @@ foreach wb = [MVE_vldst24_writeback< "vst" # n.nvecs # stage # "." # s.lanesize>; } +def SDTARMVST2 : SDTypeProfile<1, 5, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>, + SDTCisSameAs<3, 4>, SDTCisVT<5, i32>]>; +def SDTARMVST4 : SDTypeProfile<1, 7, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>, + SDTCisSameAs<3, 4>, SDTCisSameAs<3, 5>, + SDTCisSameAs<3, 6>, SDTCisVT<7, i32>]>; +def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain]>; +def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain]>; + multiclass MVE_vst24_patterns<int lanesize, ValueType VT> { foreach stage = [0,1] in def : Pat<(int_arm_mve_vst2q i32:$addr, - (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), + (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)), (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize) - (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), - t2_addr_offset_none:$addr)>; + (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + t2_addr_offset_none:$addr)>; + foreach stage = [0,1] in + def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32), + (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))), + (i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb) + (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1), + t2_addr_offset_none:$addr))>; foreach stage = [0,1,2,3] in def : Pat<(int_arm_mve_vst4q i32:$addr, - (VT MQPR:$v0), (VT MQPR:$v1), - (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), + (VT MQPR:$v0), (VT MQPR:$v1), + (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)), (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize) - (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, - VT:$v2, qsub_2, VT:$v3, qsub_3), - t2_addr_offset_none:$addr)>; + (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), + t2_addr_offset_none:$addr)>; + foreach stage = [0,1,2,3] in + def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64), + (VT MQPR:$v0), (VT MQPR:$v1), + (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))), + (i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb) + (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1, + VT:$v2, qsub_2, VT:$v3, qsub_3), + t2_addr_offset_none:$addr))>; } defm : MVE_vst24_patterns<8, v16i8>; defm : MVE_vst24_patterns<16, v8i16>; @@ -5097,6 +5930,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc, let mayLoad = dir.load; let mayStore = !eq(dir.load,0); + let hasSideEffects = 0; let validForTailPredication = 1; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td index 6244d8d9e27e..1b3f6075c0e9 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -509,11 +509,6 @@ def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>; def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>; def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>; -def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; -def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; -def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; - def NEONvbsl : SDNode<"ARMISD::VBSL", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, @@ -531,11 +526,6 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>; def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>; def NEONtrn : 
SDNode<"ARMISD::VTRN", SDTARMVSHUF2>; -def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisSameAs<1, 2>]>; -def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; -def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; - def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>, SDTCisVT<2, v8i8>]>; def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>, @@ -1084,6 +1074,12 @@ def : Pat<(vector_insert (v4f16 DPR:$src), def : Pat<(vector_insert (v8f16 QPR:$src), (f16 (load addrmode6:$addr)), imm:$lane), (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; +def : Pat<(vector_insert (v4bf16 DPR:$src), + (bf16 (load addrmode6:$addr)), imm:$lane), + (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>; +def : Pat<(vector_insert (v8bf16 QPR:$src), + (bf16 (load addrmode6:$addr)), imm:$lane), + (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>; def : Pat<(vector_insert (v2f32 DPR:$src), (f32 (load addrmode6:$addr)), imm:$lane), (VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>; @@ -2459,57 +2455,6 @@ def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr), } //===----------------------------------------------------------------------===// -// NEON pattern fragments -//===----------------------------------------------------------------------===// - -// Extract D sub-registers of Q registers. -def DSubReg_i8_reg : SDNodeXForm<imm, [{ - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N), - MVT::i32); -}]>; -def DSubReg_i16_reg : SDNodeXForm<imm, [{ - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N), - MVT::i32); -}]>; -def DSubReg_i32_reg : SDNodeXForm<imm, [{ - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N), - MVT::i32); -}]>; -def DSubReg_f64_reg : SDNodeXForm<imm, [{ - assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N), - MVT::i32); -}]>; - -// Extract S sub-registers of Q/D registers. -def SSubReg_f32_reg : SDNodeXForm<imm, [{ - assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N), - MVT::i32); -}]>; - -// Extract S sub-registers of Q/D registers containing a given f16 lane. -def SSubReg_f16_reg : SDNodeXForm<imm, [{ - assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering"); - return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N), - MVT::i32); -}]>; - -// Translate lane numbers from Q registers to D subregs. 
-def SubReg_i8_lane : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32); -}]>; -def SubReg_i16_lane : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32); -}]>; -def SubReg_i32_lane : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32); -}]>; - -//===----------------------------------------------------------------------===// // Instruction Classes //===----------------------------------------------------------------------===// @@ -4367,7 +4312,7 @@ def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (i32 0))>; def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))), (VMULslhd DPR:$Rn, - (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), (f16 HPR:$Rm), ssub_0), (i32 0))>; def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (VMULslfq QPR:$Rn, @@ -4375,7 +4320,7 @@ def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))), (i32 0))>; def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))), (VMULslhq QPR:$Rn, - (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), (f16 HPR:$Rm), ssub_0), (i32 0))>; } @@ -4433,17 +4378,17 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1), let PostEncoderMethod = "NEONThumb2DataIPostEncoder", DecoderNamespace = "NEONData" in { defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "s", NEONvmulls, 1>; + "vmull", "s", ARMvmulls, 1>; defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D, - "vmull", "u", NEONvmullu, 1>; + "vmull", "u", ARMvmullu, 1>; def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8", v8i16, v8i8, int_arm_neon_vmullp, 1>; def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary, "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>, Requires<[HasV8, HasCrypto]>; } -defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>; -defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>; +defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>; +defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>; // VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D) defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D, @@ -4513,12 +4458,12 @@ def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1), // VMLAL : Vector Multiply Accumulate Long (Q += D * D) defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlal", "s", NEONvmulls, add>; + "vmlal", "s", ARMvmulls, add>; defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlal", "u", NEONvmullu, add>; + "vmlal", "u", ARMvmullu, add>; -defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; -defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; +defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", ARMvmulls, add>; +defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", ARMvmullu, add>; let Predicates = [HasNEON, HasV8_1a] in { // v8.1a Neon Rounding Double Multiply-Op vector operations, @@ -4746,12 +4691,12 @@ def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1), // VMLSL : Vector Multiply Subtract Long (Q -= D * D) defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlsl", "s", NEONvmulls, sub>; + "vmlsl", "s", ARMvmulls, sub>; defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D, - "vmlsl", "u", 
NEONvmullu, sub>; + "vmlsl", "u", ARMvmullu, sub>; -defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>; -defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>; +defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", ARMvmulls, sub>; +defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", ARMvmullu, sub>; // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, @@ -4833,10 +4778,10 @@ def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)), // We put them in the VFPV8 decoder namespace because the ARM and Thumb // encodings are the same and thus no further bit twiddling is necessary // in the disassembler. -class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy, - ValueType AccumTy, ValueType InputTy, +class VDOT<bit op6, bit op4, bit op23, RegisterClass RegTy, string Asm, + string AsmTy, ValueType AccumTy, ValueType InputTy, SDPatternOperator OpNode> : - N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst), + N3Vnp<{0b1100, op23}, 0b10, 0b1101, op6, op4, (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD, Asm, AsmTy, [(set (AccumTy RegTy:$dst), @@ -4848,10 +4793,10 @@ class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy, let Constraints = "$dst = $Vd"; } -def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>; -def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>; -def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>; -def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>; +def VUDOTD : VDOT<0, 1, 0, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>; +def VSDOTD : VDOT<0, 0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>; +def VUDOTQ : VDOT<1, 1, 0, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>; +def VSDOTQ : VDOT<1, 0, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>; // Indexed dot product instructions: multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty, @@ -4886,6 +4831,68 @@ defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8, defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8, int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +// v8.6A matrix multiplication extension +let Predicates = [HasMatMulInt8] in { + class N3VMatMul<bit B, bit U, string Asm, string AsmTy, + SDPatternOperator OpNode> + : N3Vnp<{0b1100, B}, 0b10, 0b1100, 1, U, (outs QPR:$dst), + (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), N3RegFrm, NoItinerary, + Asm, AsmTy, + [(set (v4i32 QPR:$dst), (OpNode (v4i32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { + let DecoderNamespace = "VFPV8"; + let Constraints = "$dst = $Vd"; + } + + multiclass N3VMixedDotLane<bit Q, bit U, string Asm, string AsmTy, RegisterClass RegTy, + ValueType AccumTy, ValueType InputTy, SDPatternOperator OpNode, + dag RHS> { + + def "" : N3Vnp<0b11101, 0b00, 0b1101, Q, U, (outs RegTy:$dst), + (ins RegTy:$Vd, RegTy:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), N3RegFrm, + NoItinerary, Asm, AsmTy, []> { + bit lane; + let Inst{5} = lane; + let AsmString = !strconcat(Asm, ".", AsmTy, "\t$Vd, $Vn, $Vm$lane"); + let DecoderNamespace = "VFPV8"; + let Constraints = "$dst = $Vd"; + } + + def : Pat< + (AccumTy (OpNode (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))))), + (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, 
RHS, VectorIndex32:$lane)>; + + } + + multiclass SUDOTLane<bit Q, RegisterClass RegTy, ValueType AccumTy, ValueType InputTy, dag RHS> + : N3VMixedDotLane<Q, 1, "vsudot", "u8", RegTy, AccumTy, InputTy, null_frag, null_frag> { + def : Pat< + (AccumTy (int_arm_neon_usdot (AccumTy RegTy:$Vd), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))), + (InputTy RegTy:$Vn))), + (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; + } + + def VSMMLA : N3VMatMul<0, 0, "vsmmla", "s8", int_arm_neon_smmla>; + def VUMMLA : N3VMatMul<0, 1, "vummla", "u8", int_arm_neon_ummla>; + def VUSMMLA : N3VMatMul<1, 0, "vusmmla", "s8", int_arm_neon_usmmla>; + def VUSDOTD : VDOT<0, 0, 1, DPR, "vusdot", "s8", v2i32, v8i8, int_arm_neon_usdot>; + def VUSDOTQ : VDOT<1, 0, 1, QPR, "vusdot", "s8", v4i32, v16i8, int_arm_neon_usdot>; + + defm VUSDOTDI : N3VMixedDotLane<0, 0, "vusdot", "s8", DPR, v2i32, v8i8, + int_arm_neon_usdot, (v2i32 DPR_VFP2:$Vm)>; + defm VUSDOTQI : N3VMixedDotLane<1, 0, "vusdot", "s8", QPR, v4i32, v16i8, + int_arm_neon_usdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; + defm VSUDOTDI : SUDOTLane<0, DPR, v2i32, v8i8, (v2i32 DPR_VFP2:$Vm)>; + defm VSUDOTQI : SUDOTLane<1, QPR, v4i32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +} // ARMv8.3 complex operations class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q, @@ -5232,7 +5239,6 @@ class VFMQ<string opc, string type, bits<2> S> let Inst{3} = idx{0}; } -let hasNoSchedulingInfo = 1 in { // op1 op2 op3 def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>; def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>; @@ -5242,7 +5248,6 @@ def VFMALDI : VFMD<"vfmal", "f16", 0b00>; def VFMSLDI : VFMD<"vfmsl", "f16", 0b01>; def VFMALQI : VFMQ<"vfmal", "f16", 0b00>; def VFMSLQI : VFMQ<"vfmsl", "f16", 0b01>; -} } // HasNEON, HasFP16FML @@ -5296,7 +5301,7 @@ def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1, IIC_VMOVImm, "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + (v4i16 (ARMvorrImm DPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5305,7 +5310,7 @@ def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1, IIC_VMOVImm, "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + (v2i32 (ARMvorrImm DPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -5314,7 +5319,7 @@ def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1, IIC_VMOVImm, "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + (v8i16 (ARMvorrImm QPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5323,7 +5328,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1, IIC_VMOVImm, "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + (v4i32 (ARMvorrImm QPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -5347,7 +5352,7 @@ def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1, IIC_VMOVImm, "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + (v4i16 (ARMvbicImm DPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5356,7 +5361,7 @@ def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1, IIC_VMOVImm, "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + (v2i32 (ARMvbicImm DPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -5365,7 
+5370,7 @@ def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1, IIC_VMOVImm, "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + (v8i16 (ARMvbicImm QPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5374,7 +5379,7 @@ def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1, IIC_VMOVImm, "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + (v4i32 (ARMvbicImm QPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -6354,32 +6359,57 @@ def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2), (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>; } -def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>; -def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>; - -let Predicates = [HasNEON] in { -def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane), - (EXTRACT_SUBREG - (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), - (SSubReg_f16_reg imm_even:$lane))>; +multiclass ExtractEltEvenF16<ValueType VT4, ValueType VT8> { + def : Pat<(extractelt (VT4 DPR:$src), imm_even:$lane), + (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (VT4 DPR:$src), DPR_VFP2)), + (SSubReg_f16_reg imm_even:$lane))>; + def : Pat<(extractelt (VT8 QPR:$src), imm_even:$lane), + (EXTRACT_SUBREG + (v4f32 (COPY_TO_REGCLASS (VT8 QPR:$src), QPR_VFP2)), + (SSubReg_f16_reg imm_even:$lane))>; +} -def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane), +multiclass ExtractEltOddF16VMOVH<ValueType VT4, ValueType VT8> { + def : Pat<(extractelt (VT4 DPR:$src), imm_odd:$lane), (COPY_TO_REGCLASS (VMOVH (EXTRACT_SUBREG - (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), - (SSubReg_f16_reg imm_odd:$lane))), + (v2f32 (COPY_TO_REGCLASS (VT4 DPR:$src), DPR_VFP2)), + (SSubReg_f16_reg imm_odd:$lane))), HPR)>; + def : Pat<(extractelt (VT8 QPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VMOVH (EXTRACT_SUBREG + (v4f32 (COPY_TO_REGCLASS (VT8 QPR:$src), QPR_VFP2)), + (SSubReg_f16_reg imm_odd:$lane))), + HPR)>; +} + +let Predicates = [HasNEON] in { + defm : ExtractEltEvenF16<v4f16, v8f16>; + defm : ExtractEltOddF16VMOVH<v4f16, v8f16>; +} + +let AddedComplexity = 1, Predicates = [HasNEON, HasBF16, HasFullFP16] in { + // If VMOVH (vmovx.f16) is available use it to extract BF16 from the odd lanes + defm : ExtractEltOddF16VMOVH<v4bf16, v8bf16>; +} -def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane), - (EXTRACT_SUBREG - (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)), - (SSubReg_f16_reg imm_even:$lane))>; +let Predicates = [HasBF16, HasNEON] in { + defm : ExtractEltEvenF16<v4bf16, v8bf16>; -def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane), + // Otherwise, if VMOVH is not available resort to extracting the odd lane + // into a GPR and then moving to HPR + def : Pat<(extractelt (v4bf16 DPR:$src), imm_odd:$lane), (COPY_TO_REGCLASS - (VMOVH (EXTRACT_SUBREG - (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)), - (SSubReg_f16_reg imm_odd:$lane))), + (VGETLNu16 (v4bf16 DPR:$src), imm:$lane), + HPR)>; + + def : Pat<(extractelt (v8bf16 QPR:$src), imm_odd:$lane), + (COPY_TO_REGCLASS + (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)), HPR)>; } @@ -6415,6 +6445,21 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V), } } +// TODO: for odd lanes we could optimize this a bit by using the VINS +// FullFP16 instruction when it is available +multiclass InsertEltF16<ValueType VTScalar, ValueType VT4, ValueType VT8> { + def : 
Pat<(insertelt (VT4 DPR:$src1), (VTScalar HPR:$src2), imm:$lane), + (VT4 (VSETLNi16 DPR:$src1, + (COPY_TO_REGCLASS HPR:$src2, GPR), imm:$lane))>; + def : Pat<(insertelt (VT8 QPR:$src1), (VTScalar HPR:$src2), imm:$lane), + (VT8 (INSERT_SUBREG QPR:$src1, + (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, + (DSubReg_i16_reg imm:$lane))), + (COPY_TO_REGCLASS HPR:$src2, GPR), + (SubReg_i16_lane imm:$lane))), + (DSubReg_i16_reg imm:$lane)))>; +} + let Predicates = [HasNEON] in { def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane), (v16i8 (INSERT_SUBREG QPR:$src1, @@ -6442,14 +6487,7 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)), (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)), SPR:$src2, (SSubReg_f32_reg imm:$src3))>; -def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane), - (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>; -def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane), - (v8f16 (INSERT_SUBREG QPR:$src1, - (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1, - (DSubReg_i16_reg imm:$lane))), - (VMOVRH $src2), (SubReg_i16_lane imm:$lane))), - (DSubReg_i16_reg imm:$lane)))>; +defm : InsertEltF16<f16, v4f16, v8f16>; //def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)), // (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>; @@ -6484,6 +6522,9 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)), dsub_0)>; } +let Predicates = [HasNEON, HasBF16] in +defm : InsertEltF16<bf16, v4bf16, v8bf16>; + // VDUP : Vector Duplicate (from ARM core register to all elements) class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty> @@ -6588,18 +6629,35 @@ def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)), (DSubReg_i32_reg imm:$lane))), (SubReg_i32_lane imm:$lane)))>; -def : Pat<(v4f16 (ARMvdup HPR:$src)), +def : Pat<(v4f16 (ARMvdup (f16 HPR:$src))), (v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), - HPR:$src, ssub_0), (i32 0)))>; + (f16 HPR:$src), ssub_0), (i32 0)))>; def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))), (v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))), (v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, ssub_0), (i32 0)))>; -def : Pat<(v8f16 (ARMvdup HPR:$src)), +def : Pat<(v8f16 (ARMvdup (f16 HPR:$src))), (v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), - HPR:$src, ssub_0), (i32 0)))>; + (f16 HPR:$src), ssub_0), (i32 0)))>; +} + +let Predicates = [HasNEON, HasBF16] in { +def : Pat<(v4bf16 (ARMvduplane (v4bf16 DPR:$Vm), imm:$lane)), + (VDUPLN16d DPR:$Vm, imm:$lane)>; + +def : Pat<(v8bf16 (ARMvduplane (v8bf16 QPR:$src), imm:$lane)), + (v8bf16 (VDUPLN16q (v4bf16 (EXTRACT_SUBREG QPR:$src, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + +def : Pat<(v4bf16 (ARMvdup (bf16 HPR:$src))), + (v4bf16 (VDUPLN16d (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), + (bf16 HPR:$src), ssub_0), (i32 0)))>; +def : Pat<(v8bf16 (ARMvdup (bf16 HPR:$src))), + (v8bf16 (VDUPLN16q (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), + (bf16 HPR:$src), ssub_0), (i32 0)))>; } // VMOVN : Vector Narrowing Move @@ -7330,7 +7388,7 @@ def : Pat<(arm_vmovsr GPR:$a), Requires<[HasNEON, DontUseVMOVSR]>; //===----------------------------------------------------------------------===// -// Non-Instruction Patterns or Endiness - Revert Patterns +// Non-Instruction Patterns or Endianness - Revert Patterns //===----------------------------------------------------------------------===// // bit_convert @@ -7345,6 +7403,9
@@ def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>; +def : Pat<(v4i16 (bitconvert (v4bf16 DPR:$src))), (v4i16 DPR:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 DPR:$src))), (v4bf16 DPR:$src)>; + // 128 bit conversions def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; @@ -7354,6 +7415,9 @@ def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; + +def : Pat<(v8i16 (bitconvert (v8bf16 QPR:$src))), (v8i16 QPR:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 QPR:$src))), (v8bf16 QPR:$src)>; } let Predicates = [IsLE,HasNEON] in { @@ -7361,24 +7425,28 @@ let Predicates = [IsLE,HasNEON] in { def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>; def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>; def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4bf16 DPR:$src))), (f64 DPR:$src)>; def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>; def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4bf16 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>; def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4bf16 DPR:$src))), (v2f32 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4bf16 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>; @@ -7388,6 +7456,12 @@ let Predicates = [IsLE,HasNEON] in { def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>; def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (f64 DPR:$src))), (v4bf16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v1i64 DPR:$src))), (v4bf16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v2f32 DPR:$src))), (v4bf16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v2i32 DPR:$src))), (v4bf16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v8i8 DPR:$src))), (v4bf16 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>; @@ -7399,30 +7473,35 @@ let Predicates = [IsLE,HasNEON] in { def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>; def : Pat<(v8i8 
(bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4bf16 DPR:$src))), (v8i8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>; // 128 bit conversions def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8bf16 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8bf16 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8bf16 QPR:$src))), (v4f32 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8bf16 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; @@ -7432,6 +7511,12 @@ let Predicates = [IsLE,HasNEON] in { def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>; def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v2f64 QPR:$src))), (v8bf16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v2i64 QPR:$src))), (v8bf16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v4f32 QPR:$src))), (v8bf16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v4i32 QPR:$src))), (v8bf16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v16i8 QPR:$src))), (v8bf16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; @@ -7443,6 +7528,7 @@ let Predicates = [IsLE,HasNEON] in { def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8bf16 QPR:$src))), (v16i8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; } @@ -7451,24 +7537,28 @@ let Predicates = [IsBE,HasNEON] in { def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(f64 (bitconvert (v4bf16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), 
(VREV64d32 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v1i64 (bitconvert (v4bf16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>; def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2f32 (bitconvert (v4bf16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v2i32 (bitconvert (v4bf16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; @@ -7478,6 +7568,12 @@ let Predicates = [IsBE,HasNEON] in { def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; + def : Pat<(v4bf16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>; @@ -7489,30 +7585,35 @@ let Predicates = [IsBE,HasNEON] in { def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>; + def : Pat<(v8i8 (bitconvert (v4bf16 DPR:$src))), (VREV16d8 DPR:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>; // 128 bit conversions def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8bf16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8bf16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 
QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8bf16 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8bf16 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>; @@ -7522,6 +7623,12 @@ let Predicates = [IsBE,HasNEON] in { def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; + def : Pat<(v8bf16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; @@ -7533,9 +7640,26 @@ let Predicates = [IsBE,HasNEON] in { def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8bf16 QPR:$src))), (VREV16q8 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>; } +let Predicates = [HasNEON] in { + // Here we match the specific SDNode type 'ARMVectorRegCastImpl' + // rather than the more general 'ARMVectorRegCast' which would also + // match some bitconverts. If we use the latter in cases where the + // input and output types are the same, the bitconvert gets elided + // and we end up generating a nonsense match of nothing. 
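The foreach patterns just below fold every same-size register cast between the listed Q (or D) vector types into a plain reuse of the source register. At the C level this is what should make a NEON reinterpret free; a small sketch (arm_neon.h and the vreinterpretq_* name are ACLE conventions assumed here, not part of this patch):

#include <arm_neon.h>

/* A same-width reinterpret lowers to a vector register cast; the
   patterns below map it onto the source register unchanged, so this
   should compile to at most a register-to-register copy. */
float32x4_t as_f32(int32x4_t v) {
  return vreinterpretq_f32_s32(v);
}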
+ + foreach VT = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in + foreach VT2 = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in + def : Pat<(VT (ARMVectorRegCastImpl (VT2 QPR:$src))), (VT QPR:$src)>; + + foreach VT = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, f64 ] in + foreach VT2 = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, f64 ] in + def : Pat<(VT (ARMVectorRegCastImpl (VT2 DPR:$src))), (VT DPR:$src)>; +} + // Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian let Predicates = [IsBE,HasNEON] in { def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)), @@ -7863,6 +7987,8 @@ def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)), (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; def : Pat<(v8f16 (concat_vectors DPR:$Dn, DPR:$Dm)), (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; +def : Pat<(v8bf16 (concat_vectors DPR:$Dn, DPR:$Dm)), + (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>; } //===----------------------------------------------------------------------===// @@ -8915,3 +9041,115 @@ def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm", (VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>; def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm", (VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>; + +// ARMv8.6a BFloat16 instructions. +let Predicates = [HasBF16, HasNEON] in { +class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6, + dag oops, dag iops, list<dag> pattern> + : N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops, + N3RegFrm, IIC_VDOTPROD, "", "", pattern> +{ + let DecoderNamespace = "VFPV8"; +} + +class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy> + : BF16VDOT<0b11000, 0b00, Q, (outs RegTy:$dst), + (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), + [(set (AccumTy RegTy:$dst), + (int_arm_neon_bfdot (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy RegTy:$Vm)))]> { + let Constraints = "$dst = $Vd"; + let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); + let DecoderNamespace = "VFPV8"; +} + +multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, + ValueType InputTy, dag RHS> { + + def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst), + (ins RegTy:$Vd, RegTy:$Vn, + DPR_VFP2:$Vm, VectorIndex32:$lane), []> { + bit lane; + let Inst{5} = lane; + let Constraints = "$dst = $Vd"; + let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm$lane"); + let DecoderNamespace = "VFPV8"; + } + + def : Pat< + (AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))))), + (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; +} + +def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>; +def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>; + +defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>; +defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; + +class BF16MM<bit Q, RegisterClass RegTy, + string opc> + : N3Vnp<0b11000, 0b00, 0b1100, Q, 0, + (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), + N3RegFrm, IIC_VDOTPROD, "", "", + [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { + let Constraints = "$dst = $Vd"; + let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); + let DecoderNamespace = "VFPV8"; +} + +def VMMLA : BF16MM<1, QPR, "vmmla">; + +class VBF16MALQ<bit T, string 
suffix, SDPatternOperator OpNode> + : N3VCP8<0b00, 0b11, T, 1, + (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), + NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "", + [(set (v4f32 QPR:$dst), + (OpNode (v4f32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { + let Constraints = "$dst = $Vd"; + let DecoderNamespace = "VFPV8"; +} + +def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>; +def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>; + +multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> { + def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst), + (ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx), + IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> { + bits<2> idx; + let Inst{5} = idx{1}; + let Inst{3} = idx{0}; + let Constraints = "$dst = $Vd"; + let DecoderNamespace = "VFPV8"; + } + + def : Pat< + (v4f32 (OpNode (v4f32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), + VectorIndex16:$lane)))))), + (!cast<Instruction>(NAME) QPR:$Vd, + QPR:$Vn, + (EXTRACT_SUBREG QPR:$Vm, + (DSubReg_i16_reg VectorIndex16:$lane)), + (SubReg_i16_lane VectorIndex16:$lane))>; +} + +defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>; +defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>; + +def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0, + (outs DPR:$Vd), (ins QPR:$Vm), + NoItinerary, "vcvt", "bf16.f32", "$Vd, $Vm", "", []>; +} +// End of BFloat16 instructions diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td index 18bcbda44580..7fae32117243 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -14,6 +14,10 @@ // Thumb specific DAG Nodes. // +def ARMtsecall : SDNode<"ARMISD::tSECALL", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def imm_sr_XFORM: SDNodeXForm<imm, [{ unsigned Imm = N->getZExtValue(); return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32); @@ -499,6 +503,10 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in { def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br, [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>; + // alternative return for CMSE entry functions + def tBXNS_RET : tPseudoInst<(outs), (ins), 2, IIC_Br, + [(ARMseretflag)]>, Sched<[WriteBr]>; + // Alternative return instruction used by vararg functions. def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p), 2, IIC_Br, [], @@ -560,6 +568,10 @@ let isCall = 1, let Unpredictable{1-0} = 0b11; } + def tBLXNS_CALL : PseudoInst<(outs), (ins GPRnopc:$func), IIC_Br, + [(ARMtsecall GPRnopc:$func)]>, + Requires<[IsThumb, Has8MSecExt]>, Sched<[WriteBr]>; + // ARMv4T def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func), 4, IIC_Br, @@ -1513,7 +1525,7 @@ def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br, // tromped upon when we get here from a longjmp(). We force everything out of // registers except for our own input by listing the relevant registers in // Defs. By doing so, we also cause the prologue/epilogue code to actively -// preserve all of the callee-saved resgisters, which is exactly what we want. +// preserve all of the callee-saved registers, which is exactly what we want. // $val is a scratch register for our use. 
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ], hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td index c5aae235f25d..7137e8ee66b8 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -270,7 +270,8 @@ def t2am_imm8_offset : MemOperand, // t2addrmode_imm8s4 := reg +/- (imm8 << 2) def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";} -class T2AddrMode_Imm8s4 : MemOperand { +class T2AddrMode_Imm8s4 : MemOperand, + ComplexPattern<i32, 2, "SelectT2AddrModeImm8<2>", []> { let EncoderMethod = "getT2AddrModeImm8s4OpValue"; let DecoderMethod = "DecodeT2AddrModeImm8s4"; let ParserMatchClass = MemImm8s4OffsetAsmOperand; @@ -1448,7 +1449,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { // Load doubleword def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2), (ins t2addrmode_imm8s4:$addr), - IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>, + IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", + [(set rGPR:$Rt, rGPR:$Rt2, (ARMldrd t2addrmode_imm8s4:$addr))]>, Sched<[WriteLd]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1629,7 +1631,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si, let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr), - IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>, + IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", + [(ARMstrd rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr)]>, Sched<[WriteST]>; // Indexed stores @@ -1745,7 +1748,7 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants -let mayLoad = 1 in +let mayLoad = 1, hasSideEffects = 0 in def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []>, @@ -1753,13 +1756,13 @@ def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), let DecoderMethod = "DecodeT2LDRDPreInstruction"; } -let mayLoad = 1 in +let mayLoad = 1, hasSideEffects = 0 in def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm", "$addr.base = $wb", []>, Sched<[WriteLd]>; -let mayStore = 1 in +let mayStore = 1, hasSideEffects = 0 in def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", @@ -1767,7 +1770,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), let DecoderMethod = "DecodeT2STRDPreInstruction"; } -let mayStore = 1 in +let mayStore = 1, hasSideEffects = 0 in def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr, t2am_imm8s4_offset:$imm), @@ -1871,6 +1874,34 @@ defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>; defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>; defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>; +// PLD/PLDW/PLI aliases w/ the optional .w suffix +def : t2InstAlias<"pld${p}.w\t$addr", + (t2PLDi12 t2addrmode_imm12:$addr, pred:$p)>; +def : t2InstAlias<"pld${p}.w\t$addr", + (t2PLDi8 t2addrmode_negimm8:$addr, pred:$p)>; +def : 
t2InstAlias<"pld${p}.w\t$addr", + (t2PLDs t2addrmode_so_reg:$addr, pred:$p)>; + +def : InstAlias<"pldw${p}.w\t$addr", + (t2PLDWi12 t2addrmode_imm12:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7,HasMP]>; +def : InstAlias<"pldw${p}.w\t$addr", + (t2PLDWi8 t2addrmode_negimm8:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7,HasMP]>; +def : InstAlias<"pldw${p}.w\t$addr", + (t2PLDWs t2addrmode_so_reg:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7,HasMP]>; + +def : InstAlias<"pli${p}.w\t$addr", + (t2PLIi12 t2addrmode_imm12:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7]>; +def : InstAlias<"pli${p}.w\t$addr", + (t2PLIi8 t2addrmode_negimm8:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7]>; +def : InstAlias<"pli${p}.w\t$addr", + (t2PLIs t2addrmode_so_reg:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7]>; + // pci variant is very similar to i12, but supports negative offsets // from the PC. Only PLD and PLI have pci variants (not PLDW) class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr), @@ -1893,6 +1924,24 @@ class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr), def t2PLDpci : T2Iplpci<0, "pld">, Requires<[IsThumb2]>; def t2PLIpci : T2Iplpci<1, "pli">, Requires<[IsThumb2,HasV7]>; +def : t2InstAlias<"pld${p}.w $addr", + (t2PLDpci t2ldrlabel:$addr, pred:$p)>; +def : InstAlias<"pli${p}.w $addr", + (t2PLIpci t2ldrlabel:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7]>; + +// PLD/PLI with alternate literal form. +def : t2InstAlias<"pld${p} $addr", + (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>; +def : InstAlias<"pli${p} $addr", + (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7]>; +def : t2InstAlias<"pld${p}.w $addr", + (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>; +def : InstAlias<"pli${p}.w $addr", + (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>, + Requires<[IsThumb2,HasV7]>; + //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // @@ -2436,7 +2485,7 @@ def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, rGPR:$Rn), (t2QADD rGPR:$Rm, rGPR:$Rn)>; def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, rGPR:$Rn), (t2QSUB rGPR:$Rm, rGPR:$Rn)>; -def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), +def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)), (t2QDADD rGPR:$Rm, rGPR:$Rn)>; def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)), (t2QDSUB rGPR:$Rm, rGPR:$Rn)>; @@ -2445,7 +2494,7 @@ def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn), (t2QADD rGPR:$Rm, rGPR:$Rn)>; def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn), (t2QSUB rGPR:$Rm, rGPR:$Rn)>; -def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn), +def : Thumb2DSPPat<(saddsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), (t2QDADD rGPR:$Rm, rGPR:$Rn)>; def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)), (t2QDSUB rGPR:$Rm, rGPR:$Rn)>; @@ -2716,6 +2765,8 @@ def t2SBFX: T2TwoRegBitFI< let Inst{25} = 1; let Inst{24-20} = 0b10100; let Inst{15} = 0; + + let hasSideEffects = 0; } def t2UBFX: T2TwoRegBitFI< @@ -2725,6 +2776,8 @@ def t2UBFX: T2TwoRegBitFI< let Inst{25} = 1; let Inst{24-20} = 0b11100; let Inst{15} = 0; + + let hasSideEffects = 0; } // A8.8.247 UDF - Undefined (Encoding T2) @@ -3708,7 +3761,7 @@ def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr), // when we get here from a longjmp(). We force everything out of registers // except for our own input by listing the relevant registers in Defs. 
By // doing so, we also cause the prologue/epilogue code to actively preserve -// all of the callee-saved resgisters, which is exactly what we want. +// all of the callee-saved registers, which is exactly what we want. // $val is a scratch register for our use. let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR, @@ -4147,7 +4200,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp), imm:$cp))]>, Requires<[IsThumb2]>; -// Pseudo isntruction that combines movs + predicated rsbmi +// Pseudo instruction that combines movs + predicated rsbmi // to implement integer ABS let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in { def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src), @@ -4848,9 +4901,15 @@ def : t2InstAlias<"tst${p} $Rn, $Rm", (t2TSTrr rGPR:$Rn, rGPR:$Rm, pred:$p)>; // Memory barriers +def : InstAlias<"dmb${p}.w\t$opt", (t2DMB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>; def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>; +def : InstAlias<"dmb${p}.w", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>; +def : InstAlias<"dsb${p}.w\t$opt", (t2DSB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>; def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>; +def : InstAlias<"dsb${p}.w", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>; +def : InstAlias<"isb${p}.w\t$opt", (t2ISB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>; def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>; +def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>; // Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where // 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR". @@ -5184,14 +5243,6 @@ def : t2InstAlias<"ldr${p}.w $Rt, $immediate", (t2LDRConstPool GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>; -// PLD/PLDW/PLI with alternate literal form. 
-def : t2InstAlias<"pld${p} $addr", - (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>; -def : InstAlias<"pli${p} $addr", - (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>, - Requires<[IsThumb2,HasV7]>; - - //===----------------------------------------------------------------------===// // ARMv8.1m instructions // @@ -5204,7 +5255,7 @@ class V8_1MI<dag oops, dag iops, AddrMode am, InstrItinClass itin, string asm, def t2CLRM : V8_1MI<(outs), (ins pred:$p, reglist_with_apsr:$regs, variable_ops), - AddrModeNone, NoItinerary, "clrm", "${p}\t$regs", "", []> { + AddrModeNone, NoItinerary, "clrm${p}", "$regs", "", []> { bits<16> regs; let Inst{31-16} = 0b1110100010011111; @@ -5357,6 +5408,7 @@ def t2DoLoopStart : t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br, [(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>; +let hasSideEffects = 0 in def t2LoopDec : t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), 4, IIC_Br, []>, Sched<[WriteBr]>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td index f1d1d8a89164..8a652c1d90f6 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -158,11 +158,24 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), let isUnpredicable = 1 in def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr), IIC_fpLoad16, "vldr", ".16\t$Sd, $addr", - [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>, + [(set HPR:$Sd, (f16 (alignedload16 addrmode5fp16:$addr)))]>, Requires<[HasFPRegs16]>; } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in' +def : Pat<(bf16 (alignedload16 addrmode5fp16:$addr)), + (VLDRH addrmode5fp16:$addr)> { + let Predicates = [HasFPRegs16]; +} +def : Pat<(bf16 (alignedload16 addrmode3:$addr)), + (COPY_TO_REGCLASS (LDRH addrmode3:$addr), HPR)> { + let Predicates = [HasNoFPRegs16, IsARM]; +} +def : Pat<(bf16 (alignedload16 t2addrmode_imm12:$addr)), + (COPY_TO_REGCLASS (t2LDRHi12 t2addrmode_imm12:$addr), HPR)> { + let Predicates = [HasNoFPRegs16, IsThumb]; +} + def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), IIC_fpStore64, "vstr", "\t$Dd, $addr", [(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>, @@ -180,9 +193,22 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), let isUnpredicable = 1 in def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr), IIC_fpStore16, "vstr", ".16\t$Sd, $addr", - [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>, + [(alignedstore16 (f16 HPR:$Sd), addrmode5fp16:$addr)]>, Requires<[HasFPRegs16]>; +def : Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode5fp16:$addr), + (VSTRH (bf16 HPR:$Sd), addrmode5fp16:$addr)> { + let Predicates = [HasFPRegs16]; +} +def : Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode3:$addr), + (STRH (COPY_TO_REGCLASS $Sd, GPR), addrmode3:$addr)> { + let Predicates = [HasNoFPRegs16, IsARM]; +} +def : Pat<(alignedstore16 (bf16 HPR:$Sd), t2addrmode_imm12:$addr), + (t2STRHi12 (COPY_TO_REGCLASS $Sd, GPR), t2addrmode_imm12:$addr)> { + let Predicates = [HasNoFPRegs16, IsThumb]; +} + //===----------------------------------------------------------------------===// // Load / store multiple Instructions. 
// @@ -277,7 +303,6 @@ def : MnemonicAlias<"vstm", "vstmia">; //===----------------------------------------------------------------------===// // Lazy load / store multiple Instructions // -let mayLoad = 1 in def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, NoItinerary, "vlldm${p}\t$Rn", "", []>, Requires<[HasV8MMainline, Has8MSecExt]> { @@ -288,9 +313,9 @@ def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, let Inst{15-12} = 0; let Inst{7-0} = 0; let mayLoad = 1; + let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV]; } -let mayStore = 1 in def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone, NoItinerary, "vlstm${p}\t$Rn", "", []>, Requires<[HasV8MMainline, Has8MSecExt]> { @@ -387,7 +412,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VADDH : AHbI<0b11100, 0b11, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>, + [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -412,7 +437,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VSUBH : AHbI<0b11100, 0b11, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>, + [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPALU32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -433,7 +458,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VDIVH : AHbI<0b11101, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>, + [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPDIV32]>; let TwoOperandAliasConstraint = "$Dn = $Dd" in @@ -458,7 +483,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in def VMULH : AHbI<0b11100, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>, + [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; def VNMULD : ADbI<0b11100, 0b10, 1, 0, @@ -480,7 +505,7 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0, def VNMULH : AHbI<0b11100, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>, + [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>, Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>; multiclass vsel_inst<string op, bits<2> opc, int CC> { @@ -489,7 +514,7 @@ multiclass vsel_inst<string op, bits<2> opc, int CC> { def H : AHbInp<0b11100, opc, 0, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"), - [(set HPR:$Sd, (ARMcmov HPR:$Sm, HPR:$Sn, CC))]>, + [(set (f16 HPR:$Sd), (ARMcmov (f16 HPR:$Sm), (f16 HPR:$Sn), CC))]>, Requires<[HasFullFP16]>; def S : ASbInp<0b11100, opc, 0, @@ -518,7 +543,7 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { def H : AHbInp<0b11101, 0b00, opc, (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm), NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"), - [(set HPR:$Sd, (SD HPR:$Sn, HPR:$Sm))]>, + [(set (f16 HPR:$Sd), (SD (f16 HPR:$Sn), (f16 HPR:$Sm)))]>, Requires<[HasFullFP16]>; def S : ASbInp<0b11101, 0b00, opc, @@ -564,7 +589,7 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins HPR:$Sd, HPR:$Sm), 
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm", - [(arm_cmpfpe HPR:$Sd, HPR:$Sm)]>; + [(arm_cmpfpe (f16 HPR:$Sd), (f16 HPR:$Sm))]>; def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$Dd, DPR:$Dm), @@ -583,7 +608,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins HPR:$Sd, HPR:$Sm), IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm", - [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>; + [(arm_cmpfp (f16 HPR:$Sd), (f16 HPR:$Sm))]>; } // Defs = [FPSCR_NZCV] //===----------------------------------------------------------------------===// @@ -607,7 +632,7 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0, def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm", - [(set HPR:$Sd, (fabs (f16 HPR:$Sm)))]>; + [(set (f16 HPR:$Sd), (fabs (f16 HPR:$Sm)))]>; let Defs = [FPSCR_NZCV] in { def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, @@ -633,7 +658,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins HPR:$Sd), IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0", - [(arm_cmpfpe0 HPR:$Sd)]> { + [(arm_cmpfpe0 (f16 HPR:$Sd))]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -661,7 +686,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins HPR:$Sd), IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0", - [(arm_cmpfp0 HPR:$Sd)]> { + [(arm_cmpfp0 (f16 HPR:$Sd))]> { let Inst{3-0} = 0b0000; let Inst{5} = 0; } @@ -683,6 +708,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, let Inst{22} = Dd{4}; let Predicates = [HasVFP2, HasDPVFP]; + let hasSideEffects = 0; } // Special case encoding: bits 11-8 is 0b1011. @@ -707,20 +733,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, let Inst{4} = 0; let Predicates = [HasVFP2, HasDPVFP]; + let hasSideEffects = 0; } // Between half, single and double-precision. 
+let hasSideEffects = 0 in def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def : FP16Pat<(f32 (fpextend HPR:$Sm)), - (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>; +def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))), + (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>; def : FP16Pat<(f16_to_fp GPR:$a), (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; +let hasSideEffects = 0 in def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", [/* Intentionally left blank, see patterns below */]>, @@ -731,19 +760,41 @@ def : FP16Pat<(f16 (fpround SPR:$Sm)), (COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>; def : FP16Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), + (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane), + (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; +let hasSideEffects = 0 in def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; +def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))), + (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>; +def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))), + (VCVTTHS (EXTRACT_SUBREG + (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)), + (SSubReg_f16_reg imm_odd:$lane)))>; + +let hasSideEffects = 0 in def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [/* Intentionally left blank, see patterns below */]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; +def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), + (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; +def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane), + (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH SPR:$src2), + (SSubReg_f16_reg imm:$lane)))>; + def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm", @@ -756,10 +807,12 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, // Encode instruction operands. 
let Inst{3-0} = Sm{4-1}; let Inst{5} = Sm{0}; + + let hasSideEffects = 0; } -def : FullFP16Pat<(f64 (fpextend HPR:$Sm)), - (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>, +def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))), + (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>, Requires<[HasFPARMv8, HasDPVFP]>; def : FP16Pat<(f64 (f16_to_fp GPR:$a)), (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>, @@ -779,6 +832,8 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, let Inst{5} = Dm{4}; let Inst{15-12} = Sd{4-1}; let Inst{22} = Sd{0}; + + let hasSideEffects = 0; } def : FullFP16Pat<(f16 (fpround DPR:$Dm)), @@ -798,6 +853,8 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0, // Encode instruction operands. let Inst{3-0} = Sm{4-1}; let Inst{5} = Sm{0}; + + let hasSideEffects = 0; } def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, @@ -813,11 +870,13 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0, let Inst{22} = Sd{0}; let Inst{3-0} = Dm{3-0}; let Inst{5} = Dm{4}; + + let hasSideEffects = 0; } multiclass vcvt_inst<string opc, bits<2> rm, SDPatternOperator node = null_frag> { - let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in { + let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0 in { def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0, (outs SPR:$Sd), (ins HPR:$Sm), NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"), @@ -883,14 +942,14 @@ multiclass vcvt_inst<string opc, bits<2> rm, let Predicates = [HasFPARMv8] in { let Predicates = [HasFullFP16] in { - def : Pat<(i32 (fp_to_sint (node HPR:$a))), + def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SH") HPR:$a), + (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)), GPR)>; - def : Pat<(i32 (fp_to_uint (node HPR:$a))), + def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))), (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"UH") HPR:$a), + (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)), GPR)>; } def : Pat<(i32 (fp_to_sint (node SPR:$a))), @@ -936,7 +995,7 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0, def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm", - [(set HPR:$Sd, (fneg HPR:$Sm))]>; + [(set (f16 HPR:$Sd), (fneg (f16 HPR:$Sm)))]>; multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> { def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0, @@ -1035,7 +1094,7 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs HPR:$Sd), (ins HPR:$Sm), IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm", - [(set HPR:$Sd, (fsqrt (f16 HPR:$Sm)))]>; + [(set (f16 HPR:$Sd), (fsqrt (f16 HPR:$Sm)))]>; let hasSideEffects = 0 in { let isMoveReg = 1 in { @@ -1250,7 +1309,7 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010, def VMOVRH : AVConv2I<0b11100001, 0b1001, (outs rGPR:$Rt), (ins HPR:$Sn), IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn", - [(set rGPR:$Rt, (arm_vmovrh HPR:$Sn))]>, + []>, Requires<[HasFPRegs16]>, Sched<[WriteFPMOV]> { // Instruction operands. @@ -1272,7 +1331,7 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001, def VMOVHR : AVConv4I<0b11100000, 0b1001, (outs HPR:$Sn), (ins rGPR:$Rt), IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt", - [(set HPR:$Sn, (arm_vmovhr rGPR:$Rt))]>, + []>, Requires<[HasFPRegs16]>, Sched<[WriteFPMOV]> { // Instruction operands. 
@@ -1290,6 +1349,11 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001, let isUnpredicable = 1; } +def : FPRegs16Pat<(arm_vmovrh (f16 HPR:$Sn)), (VMOVRH (f16 HPR:$Sn))>; +def : FPRegs16Pat<(arm_vmovrh (bf16 HPR:$Sn)), (VMOVRH (bf16 HPR:$Sn))>; +def : FPRegs16Pat<(f16 (arm_vmovhr rGPR:$Rt)), (VMOVHR rGPR:$Rt)>; +def : FPRegs16Pat<(bf16 (arm_vmovhr rGPR:$Rt)), (VMOVHR rGPR:$Rt)>; + // FMRDH: SPR -> GPR // FMRDL: SPR -> GPR // FMRRS: SPR -> GPR @@ -1317,6 +1381,7 @@ class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let Inst{22} = Dd{4}; let Predicates = [HasVFP2, HasDPVFP]; + let hasSideEffects = 0; } class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, @@ -1333,6 +1398,8 @@ class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let Inst{5} = Sm{0}; let Inst{15-12} = Sd{4-1}; let Inst{22} = Sd{0}; + + let hasSideEffects = 0; } class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, @@ -1352,6 +1419,7 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let Inst{22} = Sd{0}; let Predicates = [HasFullFP16]; + let hasSideEffects = 0; } def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, @@ -1465,6 +1533,7 @@ class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let Inst{22} = Sd{0}; let Predicates = [HasVFP2, HasDPVFP]; + let hasSideEffects = 0; } class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, @@ -1501,6 +1570,7 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, let Inst{22} = Sd{0}; let Predicates = [HasFullFP16]; + let hasSideEffects = 0; } // Always set Z bit in the instruction, i.e. "round towards zero" variants. @@ -1548,8 +1618,8 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)), - (COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>; +def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))), + (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>; def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, (outs SPR:$Sd), (ins DPR:$Dm), @@ -1595,8 +1665,8 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001, let isUnpredicable = 1; } -def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)), - (COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>; +def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))), + (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>; // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. let Uses = [FPSCR] in { @@ -1680,6 +1750,8 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, // if dp_operation then UInt(D:Vd) else UInt(Vd:D); let Inst{22} = dst{0}; let Inst{15-12} = dst{4-1}; + + let hasSideEffects = 0; } // Double Precision register @@ -1692,6 +1764,7 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4, let Inst{22} = dst{4}; let Inst{15-12} = dst{3-0}; + let hasSideEffects = 0; let Predicates = [HasVFP2, HasDPVFP]; } @@ -1867,6 +1940,37 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1, } // End of 'let Constraints = "$a = $dst" in' +// BFloat16 - Single precision, unary, predicated +class BF16_VCVT<string opc, bits<2> op7_6> + : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm), + VFPUnaryFrm, NoItinerary, + opc, ".bf16.f32\t$Sd, $Sm", []>, + RegConstraint<"$dst = $Sd">, + Requires<[HasBF16]>, + Sched<[]> { + bits<5> Sd; + bits<5> Sm; + + // Encode instruction operands. 
+ let Inst{3-0} = Sm{4-1}; + let Inst{5} = Sm{0}; + let Inst{15-12} = Sd{4-1}; + let Inst{22} = Sd{0}; + + let Inst{27-23} = 0b11101; // opcode1 + let Inst{21-20} = 0b11; // opcode2 + let Inst{19-16} = 0b0011; // opcode3 + let Inst{11-8} = 0b1001; + let Inst{7-6} = op7_6; + let Inst{4} = 0; + + let DecoderNamespace = "VFPV8"; + let hasSideEffects = 0; +} + +def BF16_VCVTB : BF16_VCVT<"vcvtb", 0b01>; +def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>; + //===----------------------------------------------------------------------===// // FP Multiply-Accumulate Operations. // @@ -1896,8 +2000,8 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0, def VMLAH : AHbI<0b11100, 0b00, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), - HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fadd_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), + (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx]>; @@ -1907,8 +2011,8 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>; -def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), - (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), + (VMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>; @@ -1937,8 +2041,8 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0, def VMLSH : AHbI<0b11100, 0b00, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), - HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fadd_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))), + (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx]>; @@ -1948,8 +2052,8 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), - (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), + (VMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLAD : ADbI<0b11100, 0b01, 1, 0, @@ -1977,8 +2081,8 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0, def VNMLAH : AHbI<0b11100, 0b01, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), - HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fsub_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))), + (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx]>; @@ -1989,8 +2093,8 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin), - (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fsub_mlx (fneg (fmul_su (f16 HPR:$a), HPR:$b)), HPR:$dstin), + (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; // (-dst - (a * b)) -> -(dst + (a * b)) @@ -2000,8 +2104,8 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, 
(f64 DPR:$b))), def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)), (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)), - (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su (f16 HPR:$a), HPR:$b)), + (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; def VNMLSD : ADbI<0b11100, 0b01, 0, 0, @@ -2028,7 +2132,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0, def VNMLSH : AHbI<0b11100, 0b01, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fsub_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFPVMLx]>; @@ -2038,8 +2142,8 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>; -def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin), - (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fsub_mlx (fmul_su (f16 HPR:$a), HPR:$b), HPR:$dstin), + (VNMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>; //===----------------------------------------------------------------------===// @@ -2069,8 +2173,8 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0, def VFMAH : AHbI<0b11101, 0b10, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm), - HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fadd_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), + (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -2081,8 +2185,8 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; -def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), - (VFMAH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), + (VFMAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics @@ -2093,8 +2197,8 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)), def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)), (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, HPR:$Sdin)), - (VFMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))), + (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; def VFMSD : ADbI<0b11101, 0b10, 1, 0, @@ -2121,8 +2225,8 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0, def VFMSH : AHbI<0b11101, 0b10, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), - HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fadd_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))), + (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -2133,8 +2237,8 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), def : 
Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>; -def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)), - (VFMSH HPR:$dstin, HPR:$a, HPR:$b)>, +def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)), + (VFMSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>, Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>; // Match @llvm.fma.* intrinsics @@ -2145,8 +2249,8 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)), def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin)), - (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))), + (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fma x, (fneg y), z) -> (vfms z, x, y) def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)), @@ -2155,8 +2259,8 @@ def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)), def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)), (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin)), - (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin))), + (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; def VFNMAD : ADbI<0b11101, 0b01, 1, 0, @@ -2183,8 +2287,8 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, def VFNMAH : AHbI<0b11101, 0b01, 1, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)), - HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fsub_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))), + (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -2204,8 +2308,8 @@ def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))), def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 HPR:$Sdin))), - (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))), + (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y) def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), @@ -2214,8 +2318,8 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))), def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))), (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, (fneg HPR:$Sdin))), - (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), + (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; def VFNMSD : ADbI<0b11101, 0b01, 0, 0, @@ -2241,7 +2345,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0, def VFNMSH : AHbI<0b11101, 0b01, 0, 0, (outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm), IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm", - [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>, + [(set (f16 HPR:$Sd), (fsub_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), (f16 HPR:$Sdin)))]>, RegConstraint<"$Sdin = $Sd">, Requires<[HasFullFP16,UseFusedMAC]>, 
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>; @@ -2262,8 +2366,8 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))), def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (fneg HPR:$Sdin))), - (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))), + (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y) def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), @@ -2272,8 +2376,8 @@ def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))), def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin))), - (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))), + (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; // (fneg (fma x, (fneg y), z) -> (vfnms z, x, y) def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), @@ -2282,8 +2386,8 @@ def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))), def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))), (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>, Requires<[HasVFP4]>; -def : Pat<(fneg (f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin))), - (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>, +def : Pat<(fneg (f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin)))), + (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>, Requires<[HasFullFP16]>; //===----------------------------------------------------------------------===// @@ -2306,7 +2410,7 @@ def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p), def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p), IIC_fpUNA16, [(set (f16 HPR:$Sd), - (ARMcmov HPR:$Sn, HPR:$Sm, cmovpred:$p))]>, + (ARMcmov (f16 HPR:$Sn), (f16 HPR:$Sm), cmovpred:$p))]>, RegConstraint<"$Sd = $Sn">, Requires<[HasFPRegs]>; } // hasSideEffects @@ -2512,7 +2616,7 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm), def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm), VFPMiscFrm, IIC_fpUNA16, "vmov", ".f16\t$Sd, $imm", - [(set HPR:$Sd, vfp_f16imm:$imm)]>, + [(set (f16 HPR:$Sd), vfp_f16imm:$imm)]>, Requires<[HasFullFP16]> { bits<5> Sd; bits<8> imm; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp index 67816bc2103f..c8a894fb11a8 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMInstructionSelector.cpp @@ -239,17 +239,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB, // We only support G_MERGE_VALUES as a way to stick together two scalar GPRs // into one DPR. 
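// Aside on the accessor used throughout the hunks below (not part of the
// patch): MachineInstrBuilder::getReg(N) is shorthand for
// MIB->getOperand(N).getReg(), so these rewrites are purely cosmetic.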
- Register VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB.getReg(0); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 64 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - Register VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB.getReg(1); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_MERGE_VALUES"); - Register VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB.getReg(2); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 32 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID && @@ -271,17 +271,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB, // We only support G_UNMERGE_VALUES as a way to break up one DPR into two // GPRs. - Register VReg0 = MIB->getOperand(0).getReg(); + Register VReg0 = MIB.getReg(0); (void)VReg0; assert(MRI.getType(VReg0).getSizeInBits() == 32 && RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - Register VReg1 = MIB->getOperand(1).getReg(); + Register VReg1 = MIB.getReg(1); (void)VReg1; assert(MRI.getType(VReg1).getSizeInBits() == 32 && RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID && "Unsupported operand for G_UNMERGE_VALUES"); - Register VReg2 = MIB->getOperand(2).getReg(); + Register VReg2 = MIB.getReg(2); (void)VReg2; assert(MRI.getType(VReg2).getSizeInBits() == 64 && RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID && @@ -530,7 +530,7 @@ bool ARMInstructionSelector::selectCmp(CmpConstants Helper, MachineRegisterInfo &MRI) const { const InsertInfo I(MIB); - auto ResReg = MIB->getOperand(0).getReg(); + auto ResReg = MIB.getReg(0); if (!validReg(MRI, ResReg, 1, ARM::GPRRegBankID)) return false; @@ -542,8 +542,8 @@ bool ARMInstructionSelector::selectCmp(CmpConstants Helper, return true; } - auto LHSReg = MIB->getOperand(2).getReg(); - auto RHSReg = MIB->getOperand(3).getReg(); + auto LHSReg = MIB.getReg(2); + auto RHSReg = MIB.getReg(3); if (!validOpRegPair(MRI, LHSReg, RHSReg, Helper.OperandSize, Helper.OperandRegBankID)) return false; @@ -627,7 +627,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, bool UseMovt = STI.useMovt(); unsigned Size = TM.getPointerSize(0); - unsigned Alignment = 4; + const Align Alignment(4); auto addOpsForConstantPoolLoad = [&MF, Alignment, Size](MachineInstrBuilder &MIB, @@ -687,7 +687,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB, if (Indirect) { if (!UseOpcodeThatLoads) { - auto ResultReg = MIB->getOperand(0).getReg(); + auto ResultReg = MIB.getReg(0); auto AddressReg = MRI.createVirtualRegister(&ARM::GPRRegClass); MIB->getOperand(0).setReg(AddressReg); @@ -773,7 +773,7 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, auto &DbgLoc = MIB->getDebugLoc(); // Compare the condition to 1. - auto CondReg = MIB->getOperand(1).getReg(); + auto CondReg = MIB.getReg(1); assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) && "Unsupported types for select operation"); auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.TSTri)) @@ -785,9 +785,9 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, // Move a value into the result register based on the result of the // comparison. 
- auto ResReg = MIB->getOperand(0).getReg(); - auto TrueReg = MIB->getOperand(2).getReg(); - auto FalseReg = MIB->getOperand(3).getReg(); + auto ResReg = MIB.getReg(0); + auto TrueReg = MIB.getReg(2); + auto FalseReg = MIB.getReg(3); assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) && validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) && "Unsupported types for select operation"); @@ -990,7 +990,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) { case G_FCONSTANT: { // Load from constant pool unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits() / 8; - unsigned Alignment = Size; + Align Alignment(Size); assert((Size == 4 || Size == 8) && "Unsupported FP constant type"); auto LoadOpcode = Size == 4 ? ARM::VLDRS : ARM::VLDRD; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index e2dff51ea61c..f3657155f47e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -357,13 +357,12 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate, llvm_unreachable("Unsupported size for FCmp predicate"); } -bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const { +bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { using namespace TargetOpcode; - MIRBuilder.setInstr(MI); + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); switch (MI.getOpcode()) { @@ -445,8 +444,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, } else { // We need to compare against 0. assert(CmpInst::isIntPredicate(ResultPred) && "Unsupported predicate"); - auto Zero = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MIRBuilder.buildConstant(Zero, 0); + auto Zero = MIRBuilder.buildConstant(LLT::scalar(32), 0); MIRBuilder.buildICmp(ResultPred, ProcessedResult, LibcallResult, Zero); } Results.push_back(ProcessedResult); @@ -462,7 +460,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, // Convert to integer constants, while preserving the binary representation. 
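// (Worked example, not from the patch: for an f32 G_FCONSTANT of 1.0,
// bitcastToAPInt() yields 0x3F800000, which buildConstant then materialises
// as a plain i32 G_CONSTANT.)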
auto AsInteger = MI.getOperand(1).getFPImm()->getValueAPF().bitcastToAPInt(); - MIRBuilder.buildConstant(MI.getOperand(0).getReg(), + MIRBuilder.buildConstant(MI.getOperand(0), *ConstantInt::get(Ctx, AsInteger)); break; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h index e95f8cf76103..f1c2e9c94336 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h @@ -28,9 +28,7 @@ class ARMLegalizerInfo : public LegalizerInfo { public: ARMLegalizerInfo(const ARMSubtarget &ST); - bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder, - GISelChangeObserver &Observer) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; private: void setFCmpLibcallsGNU(); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 12dddd29ca84..a84d23d3bb96 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -32,6 +32,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -50,6 +51,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" @@ -900,7 +902,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { unsigned Offset = getMemoryOpOffset(*First); Register Base = getLoadStoreBaseOp(*First).getReg(); bool BaseKill = LatestMI->killsRegister(Base); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg); DebugLoc DL = First->getDebugLoc(); MachineInstr *Merged = nullptr; @@ -991,7 +993,7 @@ static bool mayCombineMisaligned(const TargetSubtargetInfo &STI, // Stack pointer alignment is out of the programmers control so we can trust // SP-relative loads/stores. if (getLoadStoreBaseOp(MI).getReg() == ARM::SP && - STI.getFrameLowering()->getTransientStackAlignment() >= 4) + STI.getFrameLowering()->getTransientStackAlign() >= Align(4)) return true; return false; } @@ -1183,8 +1185,8 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, /// Check if the given instruction increments or decrements a register and /// return the amount it is incremented/decremented. Returns 0 if the CPSR flags /// generated by the instruction are possibly read as well. 
-static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg) { +static int isIncrementOrDecrement(const MachineInstr &MI, Register Reg, + ARMCC::CondCodes Pred, Register PredReg) { bool CheckCPSRDef; int Scale; switch (MI.getOpcode()) { @@ -1201,7 +1203,7 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, default: return 0; } - unsigned MIPredReg; + Register MIPredReg; if (MI.getOperand(0).getReg() != Reg || MI.getOperand(1).getReg() != Reg || getInstrPredicate(MI, MIPredReg) != Pred || @@ -1215,8 +1217,8 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, /// Searches for an increment or decrement of \p Reg before \p MBBI. static MachineBasicBlock::iterator -findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { +findIncDecBefore(MachineBasicBlock::iterator MBBI, Register Reg, + ARMCC::CondCodes Pred, Register PredReg, int &Offset) { Offset = 0; MachineBasicBlock &MBB = *MBBI->getParent(); MachineBasicBlock::iterator BeginMBBI = MBB.begin(); @@ -1235,8 +1237,8 @@ findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, /// Searches for a increment or decrement of \p Reg after \p MBBI. static MachineBasicBlock::iterator -findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, - ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { +findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg, + ARMCC::CondCodes Pred, Register PredReg, int &Offset) { Offset = 0; MachineBasicBlock &MBB = *MBBI->getParent(); MachineBasicBlock::iterator EndMBBI = MBB.end(); @@ -1270,7 +1272,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { const MachineOperand &BaseOP = MI->getOperand(0); Register Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); @@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc, case ARM::t2STRi8: case ARM::t2STRi12: return ARM::t2STR_POST; + + case ARM::MVE_VLDRBS16: + return ARM::MVE_VLDRBS16_post; + case ARM::MVE_VLDRBS32: + return ARM::MVE_VLDRBS32_post; + case ARM::MVE_VLDRBU16: + return ARM::MVE_VLDRBU16_post; + case ARM::MVE_VLDRBU32: + return ARM::MVE_VLDRBU32_post; + case ARM::MVE_VLDRHS32: + return ARM::MVE_VLDRHS32_post; + case ARM::MVE_VLDRHU32: + return ARM::MVE_VLDRHU32_post; + case ARM::MVE_VLDRBU8: + return ARM::MVE_VLDRBU8_post; + case ARM::MVE_VLDRHU16: + return ARM::MVE_VLDRHU16_post; + case ARM::MVE_VLDRWU32: + return ARM::MVE_VLDRWU32_post; + case ARM::MVE_VSTRB16: + return ARM::MVE_VSTRB16_post; + case ARM::MVE_VSTRB32: + return ARM::MVE_VSTRB32_post; + case ARM::MVE_VSTRH32: + return ARM::MVE_VSTRH32_post; + case ARM::MVE_VSTRBU8: + return ARM::MVE_VSTRBU8_post; + case ARM::MVE_VSTRHU16: + return ARM::MVE_VSTRHU16_post; + case ARM::MVE_VSTRWU32: + return ARM::MVE_VSTRWU32_post; + default: llvm_unreachable("Unhandled opcode!"); } } @@ -1412,7 +1446,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (MI->getOperand(0).getReg() == Base) return false; - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); @@ -1525,7 +1559,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { if 
(Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) return false; - unsigned PredReg; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); MachineBasicBlock::iterator MBBI(MI); MachineBasicBlock &MBB = *MI.getParent(); @@ -1602,13 +1636,13 @@ static bool isMemoryOp(const MachineInstr &MI) { // Don't touch volatile memory accesses - we may be changing their order. // TODO: We could allow unordered and monotonic atomics here, but we need to - // make sure the resulting ldm/stm is correctly marked as atomic. + // make sure the resulting ldm/stm is correctly marked as atomic. if (MMO.isVolatile() || MMO.isAtomic()) return false; // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is // not. - if (MMO.getAlignment() < 4) + if (MMO.getAlign() < Align(4)) return false; // str <undef> could probably be eliminated entirely, but for now we just want @@ -1692,7 +1726,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, assert((isT2 || MI->getOperand(3).getReg() == ARM::NoRegister) && "register offset not handled below"); int OffImm = getMemoryOpOffset(*MI); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); if (OddRegNum > EvenRegNum && OffImm == 0) { @@ -1792,7 +1826,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { const MachineOperand &MO = MBBI->getOperand(0); Register Reg = MO.getReg(); Register Base = getLoadStoreBaseOp(*MBBI).getReg(); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg); int Offset = getMemoryOpOffset(*MBBI); if (CurrBase == 0) { @@ -2046,6 +2080,7 @@ namespace { const TargetRegisterInfo *TRI; const ARMSubtarget *STI; MachineRegisterInfo *MRI; + MachineDominatorTree *DT; MachineFunction *MF; ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} @@ -2058,29 +2093,34 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } private: bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, - unsigned &NewOpc, unsigned &EvenReg, - unsigned &OddReg, unsigned &BaseReg, - int &Offset, - unsigned &PredReg, ARMCC::CondCodes &Pred, - bool &isT2); + unsigned &NewOpc, Register &EvenReg, Register &OddReg, + Register &BaseReg, int &Offset, Register &PredReg, + ARMCC::CondCodes &Pred, bool &isT2); bool RescheduleOps(MachineBasicBlock *MBB, SmallVectorImpl<MachineInstr *> &Ops, unsigned Base, bool isLd, DenseMap<MachineInstr*, unsigned> &MI2LocMap); bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB); + bool DistributeIncrements(); + bool DistributeIncrements(Register Base); }; } // end anonymous namespace char ARMPreAllocLoadStoreOpt::ID = 0; -INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", - ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) +INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) // Limit the number of instructions to be rescheduled. // FIXME: tune this limit, and/or come up with some better heuristics. 
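Many hunks in this file are the same mechanical migration from raw unsigned values to the type-safe llvm::Register and llvm::Align wrappers. A minimal standalone sketch of the idiom, assuming only LLVM's public Register and Alignment headers (illustrative, not part of the patch):

    #include "llvm/CodeGen/Register.h"
    #include "llvm/Support/Alignment.h"
    using namespace llvm;

    // "Register PredReg;" default-constructs to the no-register state, so
    // the old "unsigned PredReg = 0;" initialisation becomes implicit.
    static bool hasExplicitPredicateReg(Register PredReg) {
      return PredReg.isValid();
    }

    // Align can only represent a power of two, so comparisons such as
    // "MMO.getAlign() < Align(4)" cannot be fed a bogus alignment value.
    static bool atLeastWordAligned(Align A) {
      return A >= Align(4);
    }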
@@ -2096,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); MRI = &Fn.getRegInfo(); + DT = &getAnalysis<MachineDominatorTree>(); MF = &Fn; AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - bool Modified = false; + bool Modified = DistributeIncrements(); for (MachineBasicBlock &MFI : Fn) Modified |= RescheduleLoadStoreInstrs(&MFI); @@ -2143,15 +2184,10 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base, return AddedRegPressure.size() <= MemRegs.size() * 2; } -bool -ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, - DebugLoc &dl, unsigned &NewOpc, - unsigned &FirstReg, - unsigned &SecondReg, - unsigned &BaseReg, int &Offset, - unsigned &PredReg, - ARMCC::CondCodes &Pred, - bool &isT2) { +bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord( + MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc, + Register &FirstReg, Register &SecondReg, Register &BaseReg, int &Offset, + Register &PredReg, ARMCC::CondCodes &Pred, bool &isT2) { // Make sure we're allowed to generate LDRD/STRD. if (!STI->hasV5TEOps()) return false; @@ -2183,12 +2219,12 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, (*Op0->memoperands_begin())->isAtomic()) return false; - unsigned Align = (*Op0->memoperands_begin())->getAlignment(); + Align Alignment = (*Op0->memoperands_begin())->getAlign(); const Function &Func = MF->getFunction(); - unsigned ReqAlign = STI->hasV6Ops() - ? TD->getABITypeAlignment(Type::getInt64Ty(Func.getContext())) - : 8; // Pre-v6 need 8-byte align - if (Align < ReqAlign) + Align ReqAlign = + STI->hasV6Ops() ? TD->getABITypeAlign(Type::getInt64Ty(Func.getContext())) + : Align(8); // Pre-v6 need 8-byte align + if (Alignment < ReqAlign) return false; // Then make sure the immediate offset fits. @@ -2313,8 +2349,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, // to try to allocate a pair of registers that can form register pairs. MachineInstr *Op0 = Ops.back(); MachineInstr *Op1 = Ops[Ops.size()-2]; - unsigned FirstReg = 0, SecondReg = 0; - unsigned BaseReg = 0, PredReg = 0; + Register FirstReg, SecondReg; + Register BaseReg, PredReg; ARMCC::CondCodes Pred = ARMCC::AL; bool isT2 = false; unsigned NewOpc = 0; @@ -2416,7 +2452,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!isMemoryOp(MI)) continue; - unsigned PredReg = 0; + Register PredReg; if (getInstrPredicate(MI, PredReg) != ARMCC::AL) continue; @@ -2482,6 +2518,199 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { return RetVal; } +// Get the Base register operand index from the memory access MachineInstr if we +// should attempt to distribute postinc on it. Return -1 if not of a valid +// instruction type. If it returns an index, it is assumed that the instruction +// uses an r+i indexing mode, and getBaseOperandIndex() + 1 is the Offset index.
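+// As a sketch of the operand layout this relies on (registers are
+// illustrative), a zero-offset MVE access looks roughly like:
+//   renamable $q0 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg
+// where operand 1 is the base register, operand 2 (BaseOp + 1) is the
+// immediate offset, and the remaining operands are the predicate.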
+static int getBaseOperandIndex(MachineInstr &MI) { + switch (MI.getOpcode()) { + case ARM::MVE_VLDRBS16: + case ARM::MVE_VLDRBS32: + case ARM::MVE_VLDRBU16: + case ARM::MVE_VLDRBU32: + case ARM::MVE_VLDRHS32: + case ARM::MVE_VLDRHU32: + case ARM::MVE_VLDRBU8: + case ARM::MVE_VLDRHU16: + case ARM::MVE_VLDRWU32: + case ARM::MVE_VSTRB16: + case ARM::MVE_VSTRB32: + case ARM::MVE_VSTRH32: + case ARM::MVE_VSTRBU8: + case ARM::MVE_VSTRHU16: + case ARM::MVE_VSTRWU32: + return 1; + } + return -1; +} + +static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset, + Register NewReg, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) { + MachineFunction *MF = MI->getMF(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + unsigned NewOpcode = getPostIndexedLoadStoreOpcode( + MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub); + + const MCInstrDesc &MCID = TII->get(NewOpcode); + // Constrain the def register class + const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF); + MRI.constrainRegClass(NewReg, TRC); + // And do the same for the base operand + TRC = TII->getRegClass(MCID, 2, TRI, *MF); + MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC); + + return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID) + .addReg(NewReg, RegState::Define) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(Offset) + .add(MI->getOperand(3)) + .add(MI->getOperand(4)) + .cloneMemRefs(*MI); +} + +// Given a Base Register, optimise the load/store uses to attempt to create more +// post-inc accesses. We do this by taking zero offset loads/stores with an add, +// and converting them to a postinc load/store of the same type. Any subsequent +// accesses will be adjusted to use and account for the post-inc value. +// For example: +// LDR #0 LDR_POSTINC #16 +// LDR #4 LDR #-12 +// LDR #8 LDR #-8 +// LDR #12 LDR #-4 +// ADD #16 +bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) { + // We are looking for: + // One zero offset load/store that can become postinc + MachineInstr *BaseAccess = nullptr; + // An increment that can be folded in + MachineInstr *Increment = nullptr; + // Other accesses after BaseAccess that will need to be updated to use the + // postinc value + SmallPtrSet<MachineInstr *, 8> OtherAccesses; + for (auto &Use : MRI->use_nodbg_instructions(Base)) { + if (!Increment && getAddSubImmediate(Use) != 0) { + Increment = &Use; + continue; + } + + int BaseOp = getBaseOperandIndex(Use); + if (BaseOp == -1) + return false; + + if (!Use.getOperand(BaseOp).isReg() || + Use.getOperand(BaseOp).getReg() != Base) + return false; + if (Use.getOperand(BaseOp + 1).getImm() == 0) + BaseAccess = &Use; + else + OtherAccesses.insert(&Use); + } + + if (!BaseAccess || !Increment || + BaseAccess->getParent() != Increment->getParent()) + return false; + Register PredReg; + if (Increment->definesRegister(ARM::CPSR) || + getInstrPredicate(*Increment, PredReg) != ARMCC::AL) + return false; + + LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg " + << Base.virtRegIndex() << "\n"); + + // Make sure that Increment has no uses before BaseAccess.
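+  // For instance (a sketch, with illustrative virtual registers), this
+  // cannot be distributed, because the incremented value is read before
+  // the zero-offset access:
+  //   %1 = t2ADDri %0, 16, ...            ; Increment
+  //   ... some use of %1 ...
+  //   MVE_VSTRWU32 %q0, %0, 0, 0, $noreg  ; BaseAccess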
+ for (MachineInstr &Use : + MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) { + if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) { + LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n"); + return false; + } + } + + // Make sure that Increment can be folded into Base + int IncrementOffset = getAddSubImmediate(*Increment); + unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode( + BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub); + if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) { + LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n"); + return false; + } + + // And make sure that the negative value of the increment can be added to all + // other offsets after the BaseAccess. We rely on either + // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess) + // to keep things simple. + SmallPtrSet<MachineInstr *, 4> SuccessorAccesses; + for (auto *Use : OtherAccesses) { + if (DT->dominates(BaseAccess, Use)) { + SuccessorAccesses.insert(Use); + unsigned BaseOp = getBaseOperandIndex(*Use); + if (!isLegalAddressImm( + Use->getOpcode(), + Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) { + LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n"); + return false; + } + } else if (!DT->dominates(Use, BaseAccess)) { + LLVM_DEBUG( + dbgs() << " Unknown dominance relation between Base and Use\n"); + return false; + } + } + + // Replace BaseAccess with a post inc + LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump()); + LLVM_DEBUG(dbgs() << " And : "; Increment->dump()); + Register NewBaseReg = Increment->getOperand(0).getReg(); + MachineInstr *BaseAccessPost = + createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI); + BaseAccess->eraseFromParent(); + Increment->eraseFromParent(); + (void)BaseAccessPost; + LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump()); + + for (auto *Use : SuccessorAccesses) { + LLVM_DEBUG(dbgs() << "Changing: "; Use->dump()); + unsigned BaseOp = getBaseOperandIndex(*Use); + Use->getOperand(BaseOp).setReg(NewBaseReg); + int OldOffset = Use->getOperand(BaseOp + 1).getImm(); + Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset); + LLVM_DEBUG(dbgs() << " To : "; Use->dump()); + } + + // Remove the kill flag from all uses of NewBaseReg, in case any old uses + // remain. + for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg)) + Op.setIsKill(false); + return true; +} + +bool ARMPreAllocLoadStoreOpt::DistributeIncrements() { + bool Changed = false; + SmallSetVector<Register, 4> Visited; + for (auto &MBB : *MF) { + for (auto &MI : MBB) { + int BaseOp = getBaseOperandIndex(MI); + if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg()) + continue; + + Register Base = MI.getOperand(BaseOp).getReg(); + if (!Base.isVirtual() || Visited.count(Base)) + continue; + + Visited.insert(Base); + } + } + + for (auto Base : Visited) + Changed |= DistributeIncrements(Base); + + return Changed; +} + /// Returns an instance of the load / store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { if (PreAlloc) diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 6717d4706aef..be75d6bef08c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -35,6 +35,20 @@ /// are defined to be as large as this maximum sequence of replacement /// instructions. /// +/// A note on VPR.P0 (the lane mask): +/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a +/// "VPT Active" context (which includes low-overhead loops and vpt blocks). +/// They will simply "and" the result of their calculation with the current +/// value of VPR.P0. You can think of it like this: +/// \verbatim +/// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs +/// VPR.P0 &= Value +/// else +/// VPR.P0 = Value +/// \endverbatim +/// When we're inside the low-overhead loop (between DLSTP and LETP), we always +/// fall into the "VPT active" case, so we can consider that all VPR writes by +/// one of those instructions are actually an "and". //===----------------------------------------------------------------------===// #include "ARM.h" @@ -45,6 +59,7 @@ #include "Thumb2InstrInfo.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineLoopUtils.h" @@ -60,34 +75,93 @@ using namespace llvm; namespace { + using InstSet = SmallPtrSetImpl<MachineInstr *>; + + class PostOrderLoopTraversal { + MachineLoop &ML; + MachineLoopInfo &MLI; + SmallPtrSet<MachineBasicBlock*, 4> Visited; + SmallVector<MachineBasicBlock*, 4> Order; + + public: + PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) + : ML(ML), MLI(MLI) { } + + const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { + return Order; + } + + // Visit all the blocks within the loop, as well as exit blocks and any + // blocks properly dominating the header. + void ProcessLoop() { + std::function<void(MachineBasicBlock*)> Search = [this, &Search] + (MachineBasicBlock *MBB) -> void { + if (Visited.count(MBB)) + return; + + Visited.insert(MBB); + for (auto *Succ : MBB->successors()) { + if (!ML.contains(Succ)) + continue; + Search(Succ); + } + Order.push_back(MBB); + }; + + // Insert exit blocks. + SmallVector<MachineBasicBlock*, 2> ExitBlocks; + ML.getExitBlocks(ExitBlocks); + for (auto *MBB : ExitBlocks) + Order.push_back(MBB); + + // Then add the loop body. + Search(ML.getHeader()); + + // Then try the preheader and its predecessors.
+ std::function<void(MachineBasicBlock*)> GetPredecessor = + [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { + Order.push_back(MBB); + if (MBB->pred_size() == 1) + GetPredecessor(*MBB->pred_begin()); + }; + + if (auto *Preheader = ML.getLoopPreheader()) + GetPredecessor(Preheader); + else if (auto *Preheader = MLI.findLoopPreheader(&ML, true)) + GetPredecessor(Preheader); + } + }; + struct PredicatedMI { MachineInstr *MI = nullptr; SetVector<MachineInstr*> Predicates; public: - PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) : - MI(I) { + PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) { + assert(I && "Instruction must not be null!"); Predicates.insert(Preds.begin(), Preds.end()); } }; - // Represent a VPT block, a list of instructions that begins with a VPST and - // has a maximum of four proceeding instructions. All instructions within the - // block are predicated upon the vpr and we allow instructions to define the - // vpr within in the block too. + // Represent a VPT block, a list of instructions that begins with a VPT/VPST + // and has a maximum of four following instructions. All instructions within + // the block are predicated upon the vpr and we allow instructions to define + // the vpr within the block too. class VPTBlock { - std::unique_ptr<PredicatedMI> VPST; + // The predicate then instruction, which is either a VPT or a VPST + // instruction. + std::unique_ptr<PredicatedMI> PredicateThen; PredicatedMI *Divergent = nullptr; SmallVector<PredicatedMI, 4> Insts; public: VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { - VPST = std::make_unique<PredicatedMI>(MI, Preds); + PredicateThen = std::make_unique<PredicatedMI>(MI, Preds); } void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) { LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI); - if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) { + if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) { Divergent = &Insts.back(); LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI); } @@ -104,38 +178,73 @@ namespace { // Is the given instruction part of the predicate set controlling the entry // to the block. bool IsPredicatedOn(MachineInstr *MI) const { - return VPST->Predicates.count(MI); + return PredicateThen->Predicates.count(MI); + } + + // Returns true if this is a VPT instruction. + bool isVPT() const { return !isVPST(); } + + // Returns true if this is a VPST instruction. + bool isVPST() const { + return PredicateThen->MI->getOpcode() == ARM::MVE_VPST; } // Is the given instruction the only predicate which controls the entry to // the block.
bool IsOnlyPredicatedOn(MachineInstr *MI) const { - return IsPredicatedOn(MI) && VPST->Predicates.size() == 1; + return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1; } unsigned size() const { return Insts.size(); } SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; } - MachineInstr *getVPST() const { return VPST->MI; } + MachineInstr *getPredicateThen() const { return PredicateThen->MI; } PredicatedMI *getDivergent() const { return Divergent; } }; + struct Reduction { + MachineInstr *Init; + MachineInstr &Copy; + MachineInstr &Reduce; + MachineInstr &VPSEL; + + Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add, + MachineInstr *Sel) + : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { } + }; + struct LowOverheadLoop { - MachineLoop *ML = nullptr; + MachineLoop &ML; + MachineBasicBlock *Preheader = nullptr; + MachineLoopInfo &MLI; + ReachingDefAnalysis &RDA; + const TargetRegisterInfo &TRI; + const ARMBaseInstrInfo &TII; MachineFunction *MF = nullptr; MachineInstr *InsertPt = nullptr; MachineInstr *Start = nullptr; MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; MachineInstr *VCTP = nullptr; + SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs; VPTBlock *CurrentBlock = nullptr; SetVector<MachineInstr*> CurrentPredicate; SmallVector<VPTBlock, 4> VPTBlocks; + SmallPtrSet<MachineInstr*, 4> ToRemove; + SmallVector<std::unique_ptr<Reduction>, 1> Reductions; + SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute; bool Revert = false; bool CannotTailPredicate = false; - LowOverheadLoop(MachineLoop *ML) : ML(ML) { - MF = ML->getHeader()->getParent(); + LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, + ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, + const ARMBaseInstrInfo &TII) + : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) { + MF = ML.getHeader()->getParent(); + if (auto *MBB = ML.getLoopPreheader()) + Preheader = MBB; + else if (auto *MBB = MLI.findLoopPreheader(&ML, true)) + Preheader = MBB; } // If this is an MVE instruction, check that we know how to use tail @@ -151,22 +260,30 @@ namespace { // For now, let's keep things really simple and only support a single // block for tail predication. return !Revert && FoundAllComponents() && VCTP && - !CannotTailPredicate && ML->getNumBlocks() == 1; + !CannotTailPredicate && ML.getNumBlocks() == 1; } - bool ValidateTailPredicate(MachineInstr *StartInsertPt, - ReachingDefAnalysis *RDA, - MachineLoopInfo *MLI); + // Check that the predication in the loop will be equivalent once we + // perform the conversion. Also ensure that we can provide the number + // of elements to the loop start instruction. + bool ValidateTailPredicate(MachineInstr *StartInsertPt); + + // See whether the live-out instructions are a reduction that we can fixup + // later. + bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers); + + // Check that any values available outside of the loop will be the same + // after tail predication conversion. + bool ValidateLiveOuts(); // Is it safe to define LR with DLS/WLS? // LR can be defined if it is the operand to start, because it's the same // value, or if it's going to be equivalent to the operand to Start. - MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA); + MachineInstr *isSafeToDefineLR(); // Check the branch targets are within range and we satisfy our // restrictions. 
- void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA, - MachineLoopInfo *MLI); + void CheckLegality(ARMBasicBlockUtils *BBUtils); bool FoundAllComponents() const { return Start && Dec && End; @@ -241,18 +358,19 @@ namespace { void RevertWhile(MachineInstr *MI) const; - bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const; + bool RevertLoopDec(MachineInstr *MI) const; void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; - void RemoveLoopUpdate(LowOverheadLoop &LoLoop); - void ConvertVPTBlocks(LowOverheadLoop &LoLoop); + void FixupReductions(LowOverheadLoop &LoLoop) const; + MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); void Expand(LowOverheadLoop &LoLoop); + void IterationCountDCE(LowOverheadLoop &LoLoop); }; } @@ -261,7 +379,7 @@ char ARMLowOverheadLoops::ID = 0; INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { +MachineInstr *LowOverheadLoop::isSafeToDefineLR() { // We can define LR because LR already contains the same value. if (Start->getOperand(0).getReg() == ARM::LR) return Start; @@ -279,52 +397,22 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { // Find an insertion point: // - Is there a (mov lr, Count) before Start? If so, and nothing else writes // to Count before Start, we can insert at that mov. - if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR)) - if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR)) + if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) return LRDef; // - Is there a (mov lr, Count) after Start? If so, and nothing else writes // to Count after Start, we can insert at that mov. - if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR)) - if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR)) + if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg)) return LRDef; // We've found no suitable LR def and Start doesn't use LR directly. Can we // just define LR anyway? - if (!RDA->isRegUsedAfter(Start, ARM::LR)) - return Start; - - return nullptr; -} - -// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must -// not define a register that is used by any instructions, after and including, -// 'To'. These instructions also must not redefine any of Froms operands. -template<typename Iterator> -static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) { - SmallSet<int, 2> Defs; - // First check that From would compute the same value if moved. - for (auto &MO : From->operands()) { - if (!MO.isReg() || MO.isUndef() || !MO.getReg()) - continue; - if (MO.isDef()) - Defs.insert(MO.getReg()); - else if (!RDA->hasSameReachingDef(From, To, MO.getReg())) - return false; - } - - // Now walk checking that the rest of the instructions will compute the same - // value. - for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) { - for (auto &MO : I->operands()) - if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg())) - return false; - } - return true; + return RDA.isSafeToDefRegAt(Start, ARM::LR) ? 
Start : nullptr; } -bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, - ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) { +bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { assert(VCTP && "VCTP instruction expected but is not set"); // All predication within the loop should be based on vctp. If the block // isn't predicated on entry, check whether the vctp is within the block @@ -332,24 +420,35 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, for (auto &Block : VPTBlocks) { if (Block.IsPredicatedOn(VCTP)) continue; - if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) { + if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) { LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: " - << *Block.getDivergent()->MI); + << *Block.getDivergent()->MI); return false; } SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); for (auto &PredMI : Insts) { - if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI)) + // Check the instructions in the block and only allow: + // - VCTPs + // - Instructions predicated on the main VCTP + // - Any VCMP + // - VCMPs just "and" their result with VPR.P0. Whether they are + // located before/after the VCTP is irrelevant - the end result will + // be the same in both cases, so there's no point in requiring them + // to be located after the VCTP! + if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) || + VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0) continue; LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI - << " - which is predicated on:\n"; - for (auto *MI : PredMI.Predicates) - dbgs() << " - " << *MI; - ); + << " - which is predicated on:\n"; + for (auto *MI : PredMI.Predicates) + dbgs() << " - " << *MI); return false; } } + if (!ValidateLiveOuts()) + return false; + // For tail predication, we need to provide the number of elements, instead // of the iteration count, to the loop start instruction. The number of // elements is provided to the vctp instruction, so we need to check that @@ -359,7 +458,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, // If the register is defined within the loop, then we can't perform TP. // TODO: Check whether this is just a mov of a register that would be // available. - if (RDA->getReachingDef(VCTP, NumElements) >= 0) { + if (RDA.hasLocalDefBefore(VCTP, NumElements)) { LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n"); return false; } @@ -367,17 +466,20 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, // The element count register may be defined after InsertPt, in which case we // need to try to move either InsertPt or the def so that the [w|d]lstp can // use the value. - MachineBasicBlock *InsertBB = InsertPt->getParent(); - if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) { - if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) { - if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) { + // TODO: On failing to move an instruction, check if the count is provided by + // a mov and whether we can use the mov operand directly.
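+  // e.g. (an illustrative sketch): if the element count only reaches here
+  // through
+  //   $r3 = tMOVr $r4, 14, $noreg
+  // and $r4 is still available at the insertion point, the [w|d]lstp could
+  // take $r4 directly rather than moving any instructions.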
+ MachineBasicBlock *InsertBB = StartInsertPt->getParent(); + if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) { + if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) { + if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) { ElemDef->removeFromParent(); - InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef); + InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef); LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: " << *ElemDef); - } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) { - InsertPt->removeFromParent(); - InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt); + } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) { + StartInsertPt->removeFromParent(); + InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), + StartInsertPt); LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); } else { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop " @@ -390,10 +492,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, // Especially in the case of while loops, InsertBB may not be the // preheader, so we need to check that the register isn't redefined // before entering the loop. - auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB, + auto CannotProvideElements = [this](MachineBasicBlock *MBB, Register NumElements) { // NumElements is redefined in this block. - if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0) + if (RDA.hasLocalDefBefore(&MBB->back(), NumElements)) return true; // Don't continue searching up through multiple predecessors. @@ -404,7 +506,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, }; // First, find the block that looks like the preheader. - MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true); + MachineBasicBlock *MBB = Preheader; if (!MBB) { LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n"); return false; @@ -419,13 +521,372 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt, MBB = *MBB->pred_begin(); } - LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n"); + // Check that the value change of the element count is what we expect and + // that the predication will be equivalent. For this we need: + // NumElements = NumElements - VectorWidth. The sub will be a sub immediate + // and we can also allow register copies within the chain too. 
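+  // For a VCTP32 loop the expected update of the element count is, roughly:
+  //   $r3 = t2SUBri $r3, 4, ...   ; NumElements -= 4 (the vector width)
+  // possibly with register copies (movs) in between, which the checks below
+  // tolerate.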
+ auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { + return -getAddSubImmediate(*MI) == ExpectedVecWidth; + }; + + MBB = VCTP->getParent(); + if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) { + SmallPtrSet<MachineInstr*, 2> ElementChain; + SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP }; + unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); + + Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end()); + + if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) { + bool FoundSub = false; + + for (auto *MI : ElementChain) { + if (isMovRegOpcode(MI->getOpcode())) + continue; + + if (isSubImmOpcode(MI->getOpcode())) { + if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) + return false; + FoundSub = true; + } else + return false; + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n"; + for (auto *MI : ElementChain) + dbgs() << " - " << *MI); + ToRemove.insert(ElementChain.begin(), ElementChain.end()); + } + } + return true; +} + +static bool isVectorPredicated(MachineInstr *MI) { + int PIdx = llvm::findFirstVPTPredOperandIdx(*MI); + return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR; +} + +static bool isRegInClass(const MachineOperand &MO, + const TargetRegisterClass *Class) { + return MO.isReg() && MO.getReg() && Class->contains(MO.getReg()); +} + +// MVE 'narrowing' operations operate on half a lane, reading from half and +// writing to half, which are referred to as the top and bottom half. The +// other half retains its previous value. +static bool retainsPreviousHalfElement(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::RetainsPreviousHalfElement) != 0; +} + +// Some MVE instructions read from the top/bottom halves of their operand(s) +// and generate a vector result with result elements that are double the +// width of the input. +static bool producesDoubleWidthResult(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::DoubleWidthResult) != 0; +} + +static bool isHorizontalReduction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::HorizontalReduction) != 0; +} + +// Can this instruction generate a non-zero result when given only zeroed +// operands? This allows us to know that, given operands with false bytes +// zeroed by masked loads, the result will also contain zeros in those +// bytes. +static bool canGenerateNonZeros(const MachineInstr &MI) { + + // Check for instructions which can write into a larger element size, + // possibly writing into a previous zero'd lane. + if (producesDoubleWidthResult(MI)) + return true; + + switch (MI.getOpcode()) { + default: + break; + // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow + // fp16 -> fp32 vector conversions. + // Instructions that perform a NOT will generate 1s from 0s. + case ARM::MVE_VMVN: + case ARM::MVE_VORN: + // Count leading zeros will do just that! + case ARM::MVE_VCLZs8: + case ARM::MVE_VCLZs16: + case ARM::MVE_VCLZs32: + return true; + } + return false; +} + + +// Look at its register uses to see if it can only receive zeros +// into its false lanes, which would then produce zeros. Also check that +// the output register is also defined by a FalseLanesZero instruction +// so that if tail-predication happens, the lanes that aren't updated will +// still be zeros.
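+// For example (a sketch): if both sources of a lane-wise VADD are masked
+// loads, their false bytes are already zero, so the VADD's false bytes are
+// 0 + 0 = 0 and the result can join the FalseLanesZero set as well.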
+static bool producesFalseLanesZero(MachineInstr &MI, + const TargetRegisterClass *QPRs, + const ReachingDefAnalysis &RDA, + InstSet &FalseLanesZero) { + if (canGenerateNonZeros(MI)) + return false; + + bool AllowScalars = isHorizontalReduction(MI); + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg()) + continue; + if (!isRegInClass(MO, QPRs) && AllowScalars) + continue; + if (auto *OpDef = RDA.getMIOperand(&MI, MO)) + if (FalseLanesZero.count(OpDef)) + continue; + return false; + } + LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); + return true; +} + +bool +LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) { + // Also check for reductions where the operation needs to be merging values + // from the last and previous loop iterations. This means an instruction + // producing a value and a vmov storing the value calculated in the previous + // iteration. So we can have two live-out regs, one produced by a vmov and + // both being consumed by a vpsel. + LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n"; + for (auto *MI : LiveMIs) + dbgs() << " - " << *MI); + + if (!Preheader) + return false; + + // Expect a vmov, a vadd and a single vpsel user. + // TODO: This means we can't currently support multiple reductions in the + // loop. + if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1) + return false; + + MachineInstr *VPSEL = *LiveOutUsers.begin(); + if (VPSEL->getOpcode() != ARM::MVE_VPSEL) + return false; + + unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1; + MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx); + if (!Pred || Pred != VCTP) { + LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n"); + return false; + } + + MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1); + if (!Reduce) + return false; + + assert(LiveMIs.count(Reduce) && "Expected MI to be live-out"); + + // TODO: Support more operations than VADD. + switch (VCTP->getOpcode()) { + default: + return false; + case ARM::MVE_VCTP8: + if (Reduce->getOpcode() != ARM::MVE_VADDi8) + return false; + break; + case ARM::MVE_VCTP16: + if (Reduce->getOpcode() != ARM::MVE_VADDi16) + return false; + break; + case ARM::MVE_VCTP32: + if (Reduce->getOpcode() != ARM::MVE_VADDi32) + return false; + break; + } + + // Test that the reduce op is overwriting one of its operands. + if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() && + Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n"); + return false; + } + + // Check that the VORR is actually a VMOV. + MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2); + if (!Copy || Copy->getOpcode() != ARM::MVE_VORR || + !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() || + Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg()) + return false; + + assert(LiveMIs.count(Copy) && "Expected MI to be live-out"); + + // Check that the vadd and vmov are only used by each other and the vpsel.
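+  // Putting the pieces together, the pattern being matched is roughly
+  // (virtual registers illustrative):
+  //   %vmov = MVE_VORR %acc, %acc, ...    ; copy of the accumulator
+  //   %acc  = MVE_VADDi32 %acc, %x, ...   ; reduce, overwriting an operand
+  //   ; and in the exit block:
+  //   %res  = MVE_VPSEL %acc, %vmov, ...  ; merge the last two iterations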
+ SmallPtrSet<MachineInstr*, 2> CopyUsers; + RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers); + if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n"); + return false; + } + + SmallPtrSet<MachineInstr*, 2> ReduceUsers; + RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers); + if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n"); + return false; + } + + // Then find whether there's an instruction initialising the register that + // is storing the reduction. + SmallPtrSet<MachineInstr*, 2> Incoming; + RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming); + if (Incoming.size() > 1) + return false; + + MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin(); + LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n" + << " - " << *Copy + << " - " << *Reduce + << " - " << *VPSEL); + Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce, VPSEL)); return true; } -void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, - ReachingDefAnalysis *RDA, - MachineLoopInfo *MLI) { +bool LowOverheadLoop::ValidateLiveOuts() { + // We want to find out if the tail-predicated version of this loop will + // produce the same values as the loop in its original form. For this to + // be true, the newly inserted implicit predication must not change the + // (observable) results. + // We're doing this because many instructions in the loop will not be + // predicated and so the conversion from VPT predication to tail-predication + // can result in different values being produced, due to the tail-predication + // preventing many instructions from updating their falsely predicated + // lanes. This analysis assumes that all the instructions perform lane-wise + // operations and don't perform any exchanges. + // A masked load, whether through VPT or tail predication, will write zeros + // to any of the falsely predicated bytes. So, from the loads, we know that + // the false lanes are zeroed and here we're trying to track that those false + // lanes remain zero, or where they change, the differences are masked away + // by their user(s). + // All MVE loads and stores have to be predicated, so we know that any load + // operands, or stored results are equivalent already. Other explicitly + // predicated instructions will perform the same operation in the original + // loop and the tail-predicated form too. Because of this, we can insert + // loads, stores and other predicated instructions into our Predicated + // set and build from there. + const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID); + SetVector<MachineInstr *> FalseLanesUnknown; + SmallPtrSet<MachineInstr *, 4> FalseLanesZero; + SmallPtrSet<MachineInstr *, 4> Predicated; + MachineBasicBlock *Header = ML.getHeader(); + + for (auto &MI : *Header) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + continue; + + if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode())) + continue; + + // Predicated loads will write zeros to the falsely predicated bytes of the + // destination register.
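+    // e.g. with a VCTP32 mask leaving one active lane, a predicated 128-bit
+    // load produces { x, 0, 0, 0 }: the three false lanes are known zero.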
+ if (isVectorPredicated(&MI)) { + if (MI.mayLoad()) + FalseLanesZero.insert(&MI); + Predicated.insert(&MI); + continue; + } + + if (MI.getNumDefs() == 0) + continue; + + if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) { + // We require retaining and horizontal operations to operate upon zero'd + // false lanes to ensure the conversion doesn't change the output. + if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI)) + return false; + // Otherwise we need to evaluate this instruction later to see whether + // unknown false lanes will get masked away by their user(s). + FalseLanesUnknown.insert(&MI); + } else if (!isHorizontalReduction(MI)) + FalseLanesZero.insert(&MI); + } + + auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, + SmallPtrSetImpl<MachineInstr *> &Predicated) { + SmallPtrSet<MachineInstr *, 2> Uses; + RDA.getGlobalUses(MI, MO.getReg(), Uses); + for (auto *Use : Uses) { + if (Use != MI && !Predicated.count(Use)) + return false; + } + return true; + }; + + // Visit the unknowns in reverse so that we can start at the values being + // stored and then we can work towards the leaves, hopefully adding more + // instructions to Predicated. Successfully terminating the loop means that + // all the unknown values have been found to be masked by predicated user(s). + // For any unpredicated values, we store them in NonPredicated so that we + // can later check whether these form a reduction. + SmallPtrSet<MachineInstr*, 2> NonPredicated; + for (auto *MI : reverse(FalseLanesUnknown)) { + for (auto &MO : MI->operands()) { + if (!isRegInClass(MO, QPRs) || !MO.isDef()) + continue; + if (!HasPredicatedUsers(MI, MO, Predicated)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " + << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); + NonPredicated.insert(MI); + continue; + } + } + // Any unknown false lanes have been masked away by the user(s). + Predicated.insert(MI); + } + + SmallPtrSet<MachineInstr *, 2> LiveOutMIs; + SmallPtrSet<MachineInstr*, 2> LiveOutUsers; + SmallVector<MachineBasicBlock *, 2> ExitBlocks; + ML.getExitBlocks(ExitBlocks); + assert(ML.getNumBlocks() == 1 && "Expected single block loop!"); + assert(ExitBlocks.size() == 1 && "Expected a single exit block"); + MachineBasicBlock *ExitBB = ExitBlocks.front(); + for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { + // Check Q-regs that are live in the exit blocks. We don't collect scalars + // because they won't be affected by lane predication. + if (QPRs->contains(RegMask.PhysReg)) { + if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg)) + LiveOutMIs.insert(MI); + RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers); + } + } + + // If we have any non-predicated live-outs, they need to be part of a + // reduction that we can fix up later. The reduction takes the form of an + // operation that uses its previous values through a vmov and then a vpsel + // in the exit block, which selects the final bytes from the n and n-1 + // iterations. + if (!NonPredicated.empty() && + !FindValidReduction(NonPredicated, LiveOutUsers)) + return false; + + // We've already validated that any VPT predication within the loop will be + // equivalent when we perform the predication transformation, so we know that + // any VPT predicated instruction is predicated upon VCTP. Any live-out + // instruction needs to be predicated, so check this here.
The instructions // in NonPredicated have already been found to form a reduction that we can // fix up later. + for (auto *MI : LiveOutMIs) + if (!isVectorPredicated(MI) && !NonPredicated.count(MI)) + return false; + + return true; +} + +void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) { if (Revert) return; @@ -434,7 +895,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, // TODO Maybe there are cases where the target doesn't have to be the header, // but for now be safe and revert. - if (End->getOperand(1).getMBB() != ML->getHeader()) { + if (End->getOperand(1).getMBB() != ML.getHeader()) { LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n"); Revert = true; return; @@ -442,8 +903,8 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, // The WLS and LE instructions have 12 bits for the label offset. WLS // requires a positive offset, while LE uses negative. - if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || - !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { + if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) || + !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) { LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); Revert = true; return; @@ -458,7 +919,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, return; } - InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA); + InsertPt = Revert ? nullptr : isSafeToDefineLR(); if (!InsertPt) { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); Revert = true; @@ -473,9 +934,9 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, return; } - assert(ML->getBlocks().size() == 1 && + assert(ML.getBlocks().size() == 1 && "Shouldn't be processing a loop with more than one block"); - CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI); + CannotTailPredicate = !ValidateTailPredicate(InsertPt); LLVM_DEBUG(if (CannotTailPredicate) dbgs() << "ARM Loops: Couldn't validate tail predicate.\n"); } @@ -484,29 +945,44 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { if (CannotTailPredicate) return false; - // Only support a single vctp. - if (isVCTP(MI) && VCTP) - return false; + if (isVCTP(MI)) { + // If we find another VCTP, check whether it uses the same value as the main VCTP. + // If it does, store it in the SecondaryVCTPs set, else refuse it. + if (VCTP) { + if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) || + !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) { + LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " + "definition from the main VCTP"); + return false; + } + LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI); + SecondaryVCTPs.insert(MI); + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI); + VCTP = MI; + } + } else if (isVPTOpcode(MI->getOpcode())) { + if (MI->getOpcode() != ARM::MVE_VPST) { + assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 && + "VPT does not implicitly define VPR?!"); + CurrentPredicate.insert(MI); + } - // Start a new vpt block when we discover a vpt.
- if (MI->getOpcode() == ARM::MVE_VPST) { VPTBlocks.emplace_back(MI, CurrentPredicate); CurrentBlock = &VPTBlocks.back(); return true; - } else if (isVCTP(MI)) - VCTP = MI; - else if (MI->getOpcode() == ARM::MVE_VPSEL || - MI->getOpcode() == ARM::MVE_VPNOT) + } else if (MI->getOpcode() == ARM::MVE_VPSEL || + MI->getOpcode() == ARM::MVE_VPNOT) { + // TODO: Allow VPSEL and VPNOT, we currently cannot because: + // 1) It will use the VPR as a predicate operand, but doesn't have to be + // inside a VPT block, which means we can assert while building up + // the VPT block because we don't find another VPT or VPST to begin a new + // one. + // 2) VPSEL still requires a VPR operand even after tail predicating, + // which means we can't remove it unless there is another + // instruction, such as vcmp, that can provide the VPR def. return false; - - // TODO: Allow VPSEL and VPNOT, we currently cannot because: - // 1) It will use the VPR as a predicate operand, but doesn't have to be - // instead a VPT block, which means we can assert while building up - // the VPT block because we don't find another VPST to being a new - // one. - // 2) VPSEL still requires a VPR operand even after tail predicating, - // which means we can't remove it unless there is another - // instruction, such as vcmp, that can provide the VPR def. + } bool IsUse = false; bool IsDef = false; @@ -548,7 +1024,9 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { return false; } - return true; + // If the instruction is already explicitly predicated, then the conversion + // will be fine, but ensure that all memory operations are predicated. + return !IsUse && MI->mayLoadOrStore() ? false : true; } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { @@ -591,6 +1069,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { dbgs() << " - " << Preheader->getName() << "\n"; else if (auto *Preheader = MLI->findLoopPreheader(ML)) dbgs() << " - " << Preheader->getName() << "\n"; + else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) + dbgs() << " - " << Preheader->getName() << "\n"; for (auto *MBB : ML->getBlocks()) dbgs() << " - " << MBB->getName() << "\n"; ); @@ -608,14 +1088,12 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { return nullptr; }; - LowOverheadLoop LoLoop(ML); + LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII); // Search the preheader for the start intrinsic. // FIXME: I don't see why we shouldn't be supporting multiple predecessors // with potentially multiple set.loop.iterations, so we need to enable this. - if (auto *Preheader = ML->getLoopPreheader()) - LoLoop.Start = SearchForStart(Preheader); - else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) - LoLoop.Start = SearchForStart(Preheader); + if (LoLoop.Preheader) + LoLoop.Start = SearchForStart(LoLoop.Preheader); else return false; @@ -624,7 +1102,9 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { // whether we can convert that predicate using tail predication. for (auto *MBB : reverse(ML->getBlocks())) { for (auto &MI : *MBB) { - if (MI.getOpcode() == ARM::t2LoopDec) + if (MI.isDebugValue()) + continue; + else if (MI.getOpcode() == ARM::t2LoopDec) LoLoop.Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) LoLoop.End = &MI; @@ -641,28 +1121,6 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { // Check we know how to tail predicate any mve instructions. LoLoop.AnalyseMVEInst(&MI); } - - // We need to ensure that LR is not used or defined inbetween LoopDec and - // LoopEnd.
- if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert) - continue; - - // If we find that LR has been written or read between LoopDec and - // LoopEnd, expect that the decremented value is being used else where. - // Because this value isn't actually going to be produced until the - // latch, by LE, we would need to generate a real sub. The value is also - // likely to be copied/reloaded for use of LoopEnd - in which in case - // we'd need to perform an add because it gets subtracted again by LE! - // The other option is to then generate the other form of LE which doesn't - // perform the sub. - for (auto &MO : MI.operands()) { - if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && - MO.getReg() == ARM::LR) { - LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); - LoLoop.Revert = true; - break; - } - } } } @@ -672,7 +1130,15 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { return false; } - LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); + // Check that the only instruction using LoopDec is LoopEnd. + // TODO: Check for copy chains that really have no effect. + SmallPtrSet<MachineInstr*, 2> Uses; + RDA->getReachingLocalUses(LoLoop.Dec, ARM::LR, Uses); + if (Uses.size() > 1 || !Uses.count(LoLoop.End)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n"); + LoLoop.Revert = true; + } + LoLoop.CheckLegality(BBUtils.get()); Expand(LoLoop); return true; } @@ -702,16 +1168,19 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { MI->eraseFromParent(); } -bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, - bool SetFlags) const { +bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); + SmallPtrSet<MachineInstr*, 1> Ignore; + for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { + if (I->getOpcode() == ARM::t2LoopEnd) { + Ignore.insert(&*I); + break; + } + } // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. - if (SetFlags && - (RDA->isRegUsedAfter(MI, ARM::CPSR) || - !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR))) - SetFlags = false; + bool SetFlags = RDA->isSafeToDefRegAt(MI, ARM::CPSR, Ignore); MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); @@ -759,7 +1228,102 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { MI->eraseFromParent(); } +// Perform dead code elimination on the loop iteration count setup expression. +// If we are tail-predicating, the number of elements to be processed is the +// operand of the VCTP instruction in the vector body, see getCount(), which is +// register $r3 in this example: +// +// $lr = big-itercount-expression +// .. +// t2DoLoopStart renamable $lr +// vector.body: +// .. +// $vpr = MVE_VCTP32 renamable $r3 +// renamable $lr = t2LoopDec killed renamable $lr, 1 +// t2LoopEnd renamable $lr, %vector.body +// tB %end +// +// What we would like to achieve here is to replace the do-loop start pseudo +// instruction t2DoLoopStart with: +// +// $lr = MVE_DLSTP_32 killed renamable $r3 +// +// Thus, $r3 which defines the number of elements, is written to $lr, +// and then we want to delete the whole chain that used to define $lr, +// see the comment below for how this chain could look.
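+// Such a chain typically rounds the element count up and divides it by the
+// vector width, e.g. (an illustrative sketch for a vctp32 loop):
+//   renamable $r0 = t2ADDri renamable $r3, 3, ...
+//   renamable $lr = t2LSRri killed renamable $r0, 2, ...
+// all of which becomes dead once MVE_DLSTP_32 consumes $r3 directly.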
+// +void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { + if (!LoLoop.IsTailPredicationLegal()) + return; + + LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n"); + + MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0); + if (!Def) { + LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n"); + return; + } + + // Collect and remove the users of the iteration count. + SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, + LoLoop.End, LoLoop.InsertPt }; + SmallPtrSet<MachineInstr*, 2> Remove; + if (RDA->isSafeToRemove(Def, Remove, Killed)) + LoLoop.ToRemove.insert(Remove.begin(), Remove.end()); + else { + LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n"); + return; + } + + // Collect the dead code and the MBBs in which it resides. + RDA->collectKilledOperands(Def, Killed); + SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; + for (auto *MI : Killed) + BasicBlocks.insert(MI->getParent()); + + // Collect IT blocks in all affected basic blocks. + std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; + for (auto *MBB : BasicBlocks) { + for (auto &MI : *MBB) { + if (MI.getOpcode() != ARM::t2IT) + continue; + RDA->getReachingLocalUses(&MI, ARM::ITSTATE, ITBlocks[&MI]); + } + } + + // If we're removing all of the instructions within an IT block, then + // also remove the IT instruction. + SmallPtrSet<MachineInstr*, 2> ModifiedITs; + for (auto *MI : Killed) { + if (MachineOperand *MO = MI->findRegisterUseOperand(ARM::ITSTATE)) { + MachineInstr *IT = RDA->getMIOperand(MI, *MO); + auto &CurrentBlock = ITBlocks[IT]; + CurrentBlock.erase(MI); + if (CurrentBlock.empty()) + ModifiedITs.erase(IT); + else + ModifiedITs.insert(IT); + } + } + + // Delete the killed instructions only if we don't have any IT blocks that + // need to be modified because we would need to fix up the mask. + // TODO: Handle cases where IT blocks are modified. + if (ModifiedITs.empty()) { + LLVM_DEBUG(dbgs() << "ARM Loops: Will remove iteration count:\n"; + for (auto *MI : Killed) + dbgs() << " - " << *MI); + LoLoop.ToRemove.insert(Killed.begin(), Killed.end()); + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Would need to modify IT block(s).\n"); +} + MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { + LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n"); + // When using tail-predication, try to delete the dead code that was used to + // calculate the number of loop iterations. + IterationCountDCE(LoLoop); + MachineInstr *InsertPt = LoLoop.InsertPt; MachineInstr *Start = LoLoop.Start; MachineBasicBlock *MBB = InsertPt->getParent(); @@ -775,109 +1339,67 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { if (!IsDo) MIB.add(Start->getOperand(1)); - // When using tail-predication, try to delete the dead code that was used to - // calculate the number of loop iterations.
- if (LoLoop.IsTailPredicationLegal()) { - SmallVector<MachineInstr*, 4> Killed; - SmallVector<MachineInstr*, 4> Dead; - if (auto *Def = RDA->getReachingMIDef(Start, - Start->getOperand(0).getReg())) { - Killed.push_back(Def); - - while (!Killed.empty()) { - MachineInstr *Def = Killed.back(); - Killed.pop_back(); - Dead.push_back(Def); - for (auto &MO : Def->operands()) { - if (!MO.isReg() || !MO.isKill()) - continue; - - MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg()); - if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1) - Killed.push_back(Kill); - } - } - for (auto *MI : Dead) - MI->eraseFromParent(); - } - } - // If we're inserting at a mov lr, then remove it as it's redundant. if (InsertPt != Start) - InsertPt->eraseFromParent(); - Start->eraseFromParent(); + LoLoop.ToRemove.insert(InsertPt); + LoLoop.ToRemove.insert(Start); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); return &*MIB; } -// Goal is to optimise and clean-up these loops: -// -// vector.body: -// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg -// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4 -// .. -// $lr = MVE_DLSTP_32 renamable $r3 -// -// The SUB is the old update of the loop iteration count expression, which -// is no longer needed. This sub is removed when the element count, which is in -// r3 in this example, is defined by an instruction in the loop, and it has -// no uses. -// -void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) { - Register ElemCount = LoLoop.VCTP->getOperand(1).getReg(); - MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back(); - - LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n"); - - if (LoLoop.ML->getNumBlocks() != 1) { - LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n"); - return; - } - - LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: "; - LoLoop.VCTP->getOperand(1).dump()); - - // Find the definition we are interested in removing, if there is one. - MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount); - if (!Def) { - LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n"); - return; - } - - // Bail if we define CPSR and it is not dead - if (!Def->registerDefIsDead(ARM::CPSR, TRI)) { - LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n"); - return; - } - - // Bail if elemcount is used in exit blocks, i.e. if it is live-in. - if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n"); - return; - } +void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const { + LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n"); + auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) { + MachineBasicBlock *MBB = InsertPt.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR)); + MIB.addDef(To); + MIB.addReg(From); + MIB.addReg(From); + MIB.addImm(0); + MIB.addReg(0); + MIB.addReg(To); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB); + }; - // Bail if there are uses after this Def in the block. 
- SmallVector<MachineInstr*, 4> Uses; - RDA->getReachingLocalUses(Def, ElemCount, Uses); - if (Uses.size()) { - LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n"); - return; - } + for (auto &Reduction : LoLoop.Reductions) { + MachineInstr &Copy = Reduction->Copy; + MachineInstr &Reduce = Reduction->Reduce; + Register DestReg = Copy.getOperand(0).getReg(); - Uses.clear(); - RDA->getAllInstWithUseBefore(Def, ElemCount, Uses); + // Change the initialiser if present + if (Reduction->Init) { + MachineInstr *Init = Reduction->Init; - // Remove Def if there are no uses, or if the only use is the VCTP - // instruction. - if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: "; - Def->dump()); - Def->eraseFromParent(); - return; + for (unsigned i = 0; i < Init->getNumOperands(); ++i) { + MachineOperand &MO = Init->getOperand(i); + if (MO.isReg() && MO.isUse() && MO.isTied() && + Init->findTiedOperandIdx(i) == 0) + Init->getOperand(i).setReg(DestReg); + } + Init->getOperand(0).setReg(DestReg); + LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init); + } else + BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg()); + + // Change the reducing op to write to the register that is used to copy + // its value on the next iteration. Also update the tied-def operand. + Reduce.getOperand(0).setReg(DestReg); + Reduce.getOperand(5).setReg(DestReg); + LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce); + + // Instead of a vpsel, just copy the register into the necessary one. + MachineInstr &VPSEL = Reduction->VPSEL; + if (VPSEL.getOperand(0).getReg() != DestReg) + BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg); + + // Remove the unnecessary instructions. + LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n" + << " - " << Copy + << " - " << VPSEL << "\n"); + Copy.eraseFromParent(); + VPSEL.eraseFromParent(); } - - LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n"; - for (auto U : Uses) U->dump()); } void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { @@ -893,28 +1415,24 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { }; // There are a few scenarios which we have to fix up: - // 1) A VPT block with is only predicated by the vctp and has no internal vpr - // defs. - // 2) A VPT block which is only predicated by the vctp but has an internal - // vpr def. - // 3) A VPT block which is predicated upon the vctp as well as another vpr - // def. - // 4) A VPT block which is not predicated upon a vctp, but contains it and - // all instructions within the block are predicated upon in. - + // 1. VPT Blocks with non-uniform predicates: + // - a. When the divergent instruction is a vctp + // - b. When the block uses a vpst, and is only predicated on the vctp + // - c. When the block uses a vpt and (optionally) contains one or more + // vctp. + // 2. VPT Blocks with uniform predicates: + // - a. The block uses a vpst, and is only predicated on the vctp for (auto &Block : LoLoop.getVPTBlocks()) { SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts(); if (Block.HasNonUniformPredicate()) { PredicatedMI *Divergent = Block.getDivergent(); if (isVCTP(Divergent->MI)) { - // The vctp will be removed, so the size of the vpt block needs to be - // modified. 
- uint64_t Size = getARMVPTBlockMask(Block.size() - 1); - Block.getVPST()->getOperand(0).setImm(Size); - LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n"); - } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { - // The VPT block has a non-uniform predicate but it's entry is guarded - // only by a vctp, which means we: + // The vctp will be removed, so the block mask of the vp(s)t will need + // to be recomputed. + LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); + } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { + // The VPT block has a non-uniform predicate but it uses a vpst and its + // entry is guarded only by a vctp, which means we: // - Need to remove the original vpst. // - Then need to unpredicate any following instructions, until // we come across the divergent vpr def. @@ -922,7 +1440,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { // the divergent vpr def. // TODO: We could be producing more VPT blocks than necessary and could // fold the newly created one into a preceding one. - for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()), + for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()), E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) RemovePredicate(&*I); @@ -935,28 +1453,58 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { ++Size; ++I; } + // Create a VPST (with a null mask for now, we'll recompute it later). MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST)); - MIB.addImm(getARMVPTBlockMask(Size)); - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); + MIB.addImm(0); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); - Block.getVPST()->eraseFromParent(); + LoLoop.ToRemove.insert(Block.getPredicateThen()); + LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); + } + // Else, if the block uses a vpt, iterate over the block, removing the + // extra VCTPs it may contain. + else if (Block.isVPT()) { + bool RemovedVCTP = false; + for (PredicatedMI &Elt : Block.getInsts()) { + MachineInstr *MI = Elt.MI; + if (isVCTP(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI); + LoLoop.ToRemove.insert(MI); + RemovedVCTP = true; + continue; + } + } + if (RemovedVCTP) + LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); } - } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { - // A vpt block which is only predicated upon vctp and has no internal vpr - // defs: + } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) { + // A vpt block starting with VPST is only predicated upon vctp and has no + // internal vpr defs: // - Remove vpst. // - Unpredicate the remaining instructions.
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); - Block.getVPST()->eraseFromParent(); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); + LoLoop.ToRemove.insert(Block.getPredicateThen()); for (auto &PredMI : Insts) RemovePredicate(PredMI.MI); } } - - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); - LoLoop.VCTP->eraseFromParent(); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n"); + // Remove the "main" VCTP + LoLoop.ToRemove.insert(LoLoop.VCTP); + LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP); + // Remove remaining secondary VCTPs + for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) { + // All VCTPs that aren't marked for removal yet should be unpredicated ones. + // The predicated ones should have already been marked for removal when + // visiting the VPT blocks. + if (LoLoop.ToRemove.insert(VCTP).second) { + assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None && + "Removing Predicated VCTP without updating the block mask!"); + LLVM_DEBUG(dbgs() << " " << *VCTP); + } + } } void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { @@ -973,9 +1521,8 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { MIB.add(End->getOperand(0)); MIB.add(End->getOperand(1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); - - LoLoop.End->eraseFromParent(); - LoLoop.Dec->eraseFromParent(); + LoLoop.ToRemove.insert(LoLoop.Dec); + LoLoop.ToRemove.insert(End); return &*MIB; }; @@ -1001,7 +1548,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { RevertWhile(LoLoop.Start); else LoLoop.Start->eraseFromParent(); - bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true); + bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec); RevertLoopEnd(LoLoop.End, FlagsAlreadySet); } else { LoLoop.Start = ExpandLoopStart(LoLoop); @@ -1009,10 +1556,35 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { LoLoop.End = ExpandLoopEnd(LoLoop); RemoveDeadBranch(LoLoop.End); if (LoLoop.IsTailPredicationLegal()) { - RemoveLoopUpdate(LoLoop); ConvertVPTBlocks(LoLoop); + FixupReductions(LoLoop); + } + for (auto *I : LoLoop.ToRemove) { + LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); + I->eraseFromParent(); + } + for (auto *I : LoLoop.BlockMasksToRecompute) { + LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I); + recomputeVPTBlockMask(*I); + LLVM_DEBUG(dbgs() << " ... done: " << *I); } } + + PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); + DFS.ProcessLoop(); + const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); + for (auto *MBB : PostOrder) { + recomputeLiveIns(*MBB); + // FIXME: For some reason, the live-in print order is non-deterministic for + // our tests and I can't work out why... So just sort them. + MBB->sortUniqueLiveIns(); + } + + for (auto *MBB : reverse(PostOrder)) + recomputeLivenessFlags(*MBB); + + // We've moved, removed and inserted new instructions, so update RDA.
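Before the diff resumes with the RDA->reset() call the comment above announces, note the deferred-erasure pattern these hunks introduce: instructions are only recorded in LoLoop.ToRemove while decisions are being made, then erased in one sweep once expansion is complete, after which cached analyses are rebuilt. A minimal sketch of that shape, using a hypothetical name (DeadInsts) rather than the pass's actual members:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Illustrative only: erase everything that earlier steps merely marked dead.
// Deferring eraseFromParent() lets later steps (e.g. block-mask recomputation)
// still inspect the instructions before they disappear.
static void eraseMarkedInstructions(SmallPtrSetImpl<MachineInstr *> &DeadInsts) {
  for (MachineInstr *MI : DeadInsts)
    MI->eraseFromParent();
  DeadInsts.clear();
  // Any analysis holding pointers into the old instruction stream (such as a
  // reaching-definitions cache) is stale from this point and must be reset.
}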
+ RDA->reset(); } bool ARMLowOverheadLoops::RevertNonLoops() { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp index 8e01b998d900..f893faa4cf97 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMCInstLower.cpp @@ -194,9 +194,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) // BLX ip // POP{ r0, lr } // - OutStreamer->EmitCodeAlignment(4); + OutStreamer->emitCodeAlignment(4); auto CurSled = OutContext.createTempSymbol("xray_sled_", true); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitLabel(CurSled); auto Target = OutContext.createTempSymbol(); // Emit "B #20" instruction, which jumps over the next 24 bytes (because @@ -209,8 +209,8 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) emitNops(NoopsInSledCount); - OutStreamer->EmitLabel(Target); - recordSled(CurSled, MI, Kind); + OutStreamer->emitLabel(Target); + recordSled(CurSled, MI, Kind, 2); } void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index 3b676ca4c883..507c3e69b3a4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -15,4 +15,6 @@ void ARMFunctionInfo::anchor() {} ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) : isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()), - hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()) {} + hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()), + IsCmseNSEntry(MF.getFunction().hasFnAttribute("cmse_nonsecure_entry")), + IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")) {} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index bb136e92329b..298c8a238987 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -58,10 +58,6 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// emitPrologue. bool RestoreSPFromFP = false; - /// LRSpilledForFarJump - True if the LR register has been for spilled to - /// enable far jump. - bool LRSpilledForFarJump = false; - /// LRSpilled - True if the LR register has been spilled for /// any reason, so it's legal to emit an ARM::tBfar (i.e. "bl"). bool LRSpilled = false; @@ -87,6 +83,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills /// areas. + unsigned FPCXTSaveSize = 0; unsigned GPRCS1Size = 0; unsigned GPRCS2Size = 0; unsigned DPRCSAlignGapSize = 0; @@ -109,6 +106,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// HasITBlocks - True if IT blocks have been inserted. bool HasITBlocks = false; + // Security Extensions + bool IsCmseNSEntry; + bool IsCmseNSCall; + /// CPEClones - Track constant pool entry clones created by Constant Island /// pass.
DenseMap<unsigned, unsigned> CPEClones; @@ -144,6 +145,9 @@ public: bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; } bool isThumb2Function() const { return isThumb && hasThumb2; } + bool isCmseNSEntryFunction() const { return IsCmseNSEntry; } + bool isCmseNSCallFunction() const { return IsCmseNSCall; } + unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; } void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; } @@ -162,9 +166,6 @@ public: bool isLRSpilled() const { return LRSpilled; } void setLRIsSpilled(bool s) { LRSpilled = s; } - bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; } - void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; } - unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; } void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; } @@ -179,11 +180,13 @@ public: void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; } void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; } + unsigned getFPCXTSaveAreaSize() const { return FPCXTSaveSize; } unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; } unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; } unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; } unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; } + void setFPCXTSaveAreaSize(unsigned s) { FPCXTSaveSize = s; } void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; } void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; } void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; } @@ -252,6 +255,7 @@ public: } DenseMap<unsigned, unsigned> EHPrologueRemappedRegs; + DenseMap<unsigned, unsigned> EHPrologueOffsetInRegs; void setPreservesR0() { PreservesR0 = true; } bool getPreservesR0() const { return PreservesR0; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp index e2c9335db419..e750649ce86c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -19,8 +19,9 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsARM.h" @@ -28,7 +29,6 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" -#include "llvm/PassSupport.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -352,7 +352,6 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector<Instruction*, 8> Writes; LoadPairs.clear(); WideLoads.clear(); - OrderedBasicBlock OrderedBB(BB); // Collect loads and instructions that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user.
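The hunks below replace the deleted OrderedBasicBlock helper with Instruction::comesBefore, which answers the same intra-block ordering query directly on the instruction; LLVM numbers the instructions of the parent block lazily and caches the numbering. A minimal sketch of the new query, assuming the two instructions are already known to share a basic block:

#include "llvm/IR/Instruction.h"

using namespace llvm;

// True iff Write is ordered strictly before Read in their common basic block.
// Both instructions must have the same parent; the ordering index is computed
// on first use and cached on the BasicBlock.
static bool writePrecedesRead(const Instruction *Write, const Instruction *Read) {
  return Write->comesBefore(Read);
}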
@@ -384,7 +383,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc), ModRefInfo::ModRef))) continue; - if (OrderedBB.dominates(Write, Read)) + if (Write->comesBefore(Read)) RAWDeps[Read].insert(Write); } } @@ -392,8 +391,9 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { // Check that there is no write between the two loads which would // prevent them from being safely merged. auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) { - LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset; - LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base; + bool BaseFirst = Base->comesBefore(Offset); + LoadInst *Dominator = BaseFirst ? Base : Offset; + LoadInst *Dominated = BaseFirst ? Offset : Base; if (RAWDeps.count(Dominated)) { InstSet &WritesBefore = RAWDeps[Dominated]; @@ -401,7 +401,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { for (auto Before : WritesBefore) { // We can't move the second load backward, past a write, to merge // with the first load. - if (OrderedBB.dominates(Dominator, Before)) + if (Dominator->comesBefore(Before)) return false; } } @@ -571,6 +571,10 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) { auto Ld2 = static_cast<LoadInst*>(PMul0->RHS); auto Ld3 = static_cast<LoadInst*>(PMul1->RHS); + // Check that each mul is operating on two different loads. + if (Ld0 == Ld2 || Ld1 == Ld3) + return false; + if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); @@ -705,12 +709,11 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { } // Roughly sort the mul pairs in their program order. - OrderedBasicBlock OrderedBB(R.getRoot()->getParent()); - llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) { - const Instruction *A = PairA.first->Root; - const Instruction *B = PairB.first->Root; - return OrderedBB.dominates(A, B); - }); + llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) { + const Instruction *A = PairA.first->Root; + const Instruction *B = PairB.first->Root; + return A->comesBefore(B); + }); IntegerType *Ty = IntegerType::get(M->getContext(), 32); for (auto &Pair : R.getMulPairs()) { @@ -772,8 +775,7 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads, const unsigned AddrSpace = DomLoad->getPointerAddressSpace(); Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(), LoadTy->getPointerTo(AddrSpace)); - LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, - Base->getAlignment()); + LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign()); // Make sure everything is in the correct order in the basic block.
MoveBefore(Base->getPointerOperand(), VecPtr); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td index dea1d767beb4..1ae71be9f760 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMPredicates.td @@ -7,148 +7,160 @@ //===----------------------------------------------------------------------===// def HasV4T : Predicate<"Subtarget->hasV4TOps()">, - AssemblerPredicate<"HasV4TOps", "armv4t">; + AssemblerPredicate<(all_of HasV4TOps), "armv4t">; def NoV4T : Predicate<"!Subtarget->hasV4TOps()">; def HasV5T : Predicate<"Subtarget->hasV5TOps()">, - AssemblerPredicate<"HasV5TOps", "armv5t">; + AssemblerPredicate<(all_of HasV5TOps), "armv5t">; def NoV5T : Predicate<"!Subtarget->hasV5TOps()">; def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">, - AssemblerPredicate<"HasV5TEOps", "armv5te">; + AssemblerPredicate<(all_of HasV5TEOps), "armv5te">; def HasV6 : Predicate<"Subtarget->hasV6Ops()">, - AssemblerPredicate<"HasV6Ops", "armv6">; + AssemblerPredicate<(all_of HasV6Ops), "armv6">; def NoV6 : Predicate<"!Subtarget->hasV6Ops()">; def HasV6M : Predicate<"Subtarget->hasV6MOps()">, - AssemblerPredicate<"HasV6MOps", + AssemblerPredicate<(all_of HasV6MOps), "armv6m or armv6t2">; def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">, - AssemblerPredicate<"HasV8MBaselineOps", + AssemblerPredicate<(all_of HasV8MBaselineOps), "armv8m.base">; def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">, - AssemblerPredicate<"HasV8MMainlineOps", + AssemblerPredicate<(all_of HasV8MMainlineOps), "armv8m.main">; def HasV8_1MMainline : Predicate<"Subtarget->hasV8_1MMainlineOps()">, - AssemblerPredicate<"HasV8_1MMainlineOps", + AssemblerPredicate<(all_of HasV8_1MMainlineOps), "armv8.1m.main">; def HasMVEInt : Predicate<"Subtarget->hasMVEIntegerOps()">, - AssemblerPredicate<"HasMVEIntegerOps", + AssemblerPredicate<(all_of HasMVEIntegerOps), "mve">; def HasMVEFloat : Predicate<"Subtarget->hasMVEFloatOps()">, - AssemblerPredicate<"HasMVEFloatOps", + AssemblerPredicate<(all_of HasMVEFloatOps), "mve.fp">; +def HasCDE : Predicate<"Subtarget->hasCDEOps()">, + AssemblerPredicate<(all_of HasCDEOps), + "cde">; def HasFPRegs : Predicate<"Subtarget->hasFPRegs()">, - AssemblerPredicate<"FeatureFPRegs", + AssemblerPredicate<(all_of FeatureFPRegs), "fp registers">; def HasFPRegs16 : Predicate<"Subtarget->hasFPRegs16()">, - AssemblerPredicate<"FeatureFPRegs16", + AssemblerPredicate<(all_of FeatureFPRegs16), + "16-bit fp registers">; +def HasNoFPRegs16 : Predicate<"!Subtarget->hasFPRegs16()">, + AssemblerPredicate<(all_of (not FeatureFPRegs16)), "16-bit fp registers">; def HasFPRegs64 : Predicate<"Subtarget->hasFPRegs64()">, - AssemblerPredicate<"FeatureFPRegs64", + AssemblerPredicate<(all_of FeatureFPRegs64), "64-bit fp registers">; def HasFPRegsV8_1M : Predicate<"Subtarget->hasFPRegs() && Subtarget->hasV8_1MMainlineOps()">, - AssemblerPredicate<"FeatureFPRegs,HasV8_1MMainlineOps", + AssemblerPredicate<(all_of FeatureFPRegs, HasV8_1MMainlineOps), "armv8.1m.main with FP or MVE">; def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">, - AssemblerPredicate<"HasV6T2Ops", "armv6t2">; + AssemblerPredicate<(all_of HasV6T2Ops), "armv6t2">; def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">; def HasV6K : Predicate<"Subtarget->hasV6KOps()">, - AssemblerPredicate<"HasV6KOps", "armv6k">; + AssemblerPredicate<(all_of HasV6KOps), "armv6k">; def NoV6K : 
Predicate<"!Subtarget->hasV6KOps()">; def HasV7 : Predicate<"Subtarget->hasV7Ops()">, - AssemblerPredicate<"HasV7Ops", "armv7">; + AssemblerPredicate<(all_of HasV7Ops), "armv7">; def HasV8 : Predicate<"Subtarget->hasV8Ops()">, - AssemblerPredicate<"HasV8Ops", "armv8">; + AssemblerPredicate<(all_of HasV8Ops), "armv8">; def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, - AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; + AssemblerPredicate<(all_of (not HasV8Ops)), "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, - AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; + AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">; def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, - AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; + AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">; def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">, - AssemblerPredicate<"HasV8_3aOps", "armv8.3a">; + AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">; def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">, - AssemblerPredicate<"HasV8_4aOps", "armv8.4a">; + AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">; def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">, - AssemblerPredicate<"HasV8_5aOps", "armv8.5a">; + AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">; +def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">, + AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">; def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">, - AssemblerPredicate<"FeatureVFP2_SP", "VFP2">; + AssemblerPredicate<(all_of FeatureVFP2_SP), "VFP2">; def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">, - AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">; + AssemblerPredicate<(all_of FeatureVFP3_D16_SP), "VFP3">; def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">, - AssemblerPredicate<"FeatureVFP4_D16_SP", "VFP4">; + AssemblerPredicate<(all_of FeatureVFP4_D16_SP), "VFP4">; def HasDPVFP : Predicate<"Subtarget->hasFP64()">, - AssemblerPredicate<"FeatureFP64", + AssemblerPredicate<(all_of FeatureFP64), "double precision VFP">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8Base()">, - AssemblerPredicate<"FeatureFPARMv8_D16_SP", "FPARMv8">; + AssemblerPredicate<(all_of FeatureFPARMv8_D16_SP), "FPARMv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "NEON">; + AssemblerPredicate<(all_of FeatureNEON), "NEON">; def HasSHA2 : Predicate<"Subtarget->hasSHA2()">, - AssemblerPredicate<"FeatureSHA2", "sha2">; + AssemblerPredicate<(all_of FeatureSHA2), "sha2">; def HasAES : Predicate<"Subtarget->hasAES()">, - AssemblerPredicate<"FeatureAES", "aes">; + AssemblerPredicate<(all_of FeatureAES), "aes">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; + AssemblerPredicate<(all_of FeatureCrypto), "crypto">; def HasDotProd : Predicate<"Subtarget->hasDotProd()">, - AssemblerPredicate<"FeatureDotProd", "dotprod">; + AssemblerPredicate<(all_of FeatureDotProd), "dotprod">; def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; + AssemblerPredicate<(all_of FeatureCRC), "crc">; def HasRAS : Predicate<"Subtarget->hasRAS()">, - AssemblerPredicate<"FeatureRAS", "ras">; + AssemblerPredicate<(all_of FeatureRAS), "ras">; def HasLOB : Predicate<"Subtarget->hasLOB()">, - AssemblerPredicate<"FeatureLOB", "lob">; + AssemblerPredicate<(all_of FeatureLOB), "lob">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float conversions">; 
+ AssemblerPredicate<(all_of FeatureFP16),"half-float conversions">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, - AssemblerPredicate<"FeatureFullFP16","full half-float">; + AssemblerPredicate<(all_of FeatureFullFP16),"full half-float">; def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">, - AssemblerPredicate<"FeatureFP16FML","full half-float fml">; + AssemblerPredicate<(all_of FeatureFP16FML),"full half-float fml">; +def HasBF16 : Predicate<"Subtarget->hasBF16()">, + AssemblerPredicate<(all_of FeatureBF16),"BFloat16 floating point extension">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8),"8-bit integer matrix multiply">; def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">, - AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">; + AssemblerPredicate<(all_of FeatureHWDivThumb), "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, - AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">; + AssemblerPredicate<(all_of FeatureHWDivARM), "divide in ARM">; def HasDSP : Predicate<"Subtarget->hasDSP()">, - AssemblerPredicate<"FeatureDSP", "dsp">; + AssemblerPredicate<(all_of FeatureDSP), "dsp">; def HasDB : Predicate<"Subtarget->hasDataBarrier()">, - AssemblerPredicate<"FeatureDB", + AssemblerPredicate<(all_of FeatureDB), "data-barriers">; def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">, - AssemblerPredicate<"FeatureDFB", + AssemblerPredicate<(all_of FeatureDFB), "full-data-barrier">; def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">, - AssemblerPredicate<"FeatureV7Clrex", + AssemblerPredicate<(all_of FeatureV7Clrex), "v7 clrex">; def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">, - AssemblerPredicate<"FeatureAcquireRelease", + AssemblerPredicate<(all_of FeatureAcquireRelease), "acquire/release">; def HasMP : Predicate<"Subtarget->hasMPExtension()">, - AssemblerPredicate<"FeatureMP", + AssemblerPredicate<(all_of FeatureMP), "mp-extensions">; def HasVirtualization: Predicate<"false">, - AssemblerPredicate<"FeatureVirtualization", + AssemblerPredicate<(all_of FeatureVirtualization), "virtualization-extensions">; def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">, - AssemblerPredicate<"FeatureTrustZone", + AssemblerPredicate<(all_of FeatureTrustZone), "TrustZone">; def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">, - AssemblerPredicate<"Feature8MSecExt", + AssemblerPredicate<(all_of Feature8MSecExt), "ARMv8-M Security Extensions">; def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">; def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">; def IsThumb : Predicate<"Subtarget->isThumb()">, - AssemblerPredicate<"ModeThumb", "thumb">; + AssemblerPredicate<(all_of ModeThumb), "thumb">; def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">; def IsThumb2 : Predicate<"Subtarget->isThumb2()">, - AssemblerPredicate<"ModeThumb,FeatureThumb2", + AssemblerPredicate<(all_of ModeThumb, FeatureThumb2), "thumb2">; def IsMClass : Predicate<"Subtarget->isMClass()">, - AssemblerPredicate<"FeatureMClass", "armv*m">; + AssemblerPredicate<(all_of FeatureMClass), "armv*m">; def IsNotMClass : Predicate<"!Subtarget->isMClass()">, - AssemblerPredicate<"!FeatureMClass", + AssemblerPredicate<(all_of (not FeatureMClass)), "!armv*m">; def IsARM : Predicate<"!Subtarget->isThumb()">, - AssemblerPredicate<"!ModeThumb", "arm-mode">; + 
AssemblerPredicate<(all_of (not ModeThumb)), "arm-mode">; def IsMachO : Predicate<"Subtarget->isTargetMachO()">; def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -157,12 +169,12 @@ def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">; def IsReadTPHard : Predicate<"Subtarget->isReadTPHard()">; def IsReadTPSoft : Predicate<"!Subtarget->isReadTPHard()">; def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, - AssemblerPredicate<"FeatureNaClTrap", "NaCl">; + AssemblerPredicate<(all_of FeatureNaClTrap), "NaCl">; def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; def UseNegativeImmediates : Predicate<"false">, - AssemblerPredicate<"!FeatureNoNegativeImmediates", + AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)), "NegativeImmediates">; // FIXME: Eventually this will be just "hasV6T2Ops". @@ -206,4 +218,4 @@ def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">; // Armv8.5-A extensions def HasSB : Predicate<"Subtarget->hasSB()">, - AssemblerPredicate<"FeatureSB", "sb">; + AssemblerPredicate<(all_of FeatureSB), "sb">; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp index 43c8cd5a89be..f9dbfef4c113 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -131,45 +131,47 @@ static void checkValueMappings() { ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI) : ARMGenRegisterBankInfo() { - static bool AlreadyInit = false; // We have only one set of register banks, whatever the subtarget // is. Therefore, the initialization of the RegBanks table should be // done only once. Indeed the table of all register banks // (ARM::RegBanks) is unique in the compiler. At some point, it // will get tablegen'ed and the whole constructor becomes empty. - if (AlreadyInit) - return; - AlreadyInit = true; + static llvm::once_flag InitializeRegisterBankFlag; - const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID); - (void)RBGPR; - assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); + static auto InitializeRegisterBankOnce = [&]() { + const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID); + (void)RBGPR; + assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up"); - // Initialize the GPR bank. - assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) && - "Subclass not added?"); - assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit"); + // Initialize the GPR bank. 
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers( + *TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) && + "Subclass not added?"); + assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit"); #ifndef NDEBUG - ARM::checkPartialMappings(); - ARM::checkValueMappings(); + ARM::checkPartialMappings(); + ARM::checkValueMappings(); #endif + }; + + llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce); } const RegisterBank & diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td index 56055a15483a..a384b0dc757c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -305,6 +305,17 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> { let DiagnosticType = "rGPR"; } +// GPRs without the PC and SP but with APSR_NZCV. Some instructions allow +// accessing the APSR_NZCV, while actually encoding PC in the register field. +// This is useful for assembly and disassembly only. +// Currently used by the CDE extension. +def GPRwithAPSR_NZCVnosp + : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR_NZCV)> { + let isAllocatable = 0; + let DiagnosticString = + "operand must be a register in the range [r0, r12], r14 or apsr_nzcv"; +} + // Thumb registers are R0-R7 normally. Some instructions can still use // the general GPR register class above (MOV, e.g.) def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)> { @@ -379,7 +390,7 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> { let DiagnosticString = "operand must be a register in range [s0, s31]"; } -def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> { +def HPR : RegisterClass<"ARM", [f16, bf16], 32, (sequence "S%u", 0, 31)> { let AltOrders = [(add (decimate HPR, 2), SPR), (add (decimate HPR, 4), (decimate HPR, 2), @@ -401,7 +412,7 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> { // class. // ARM requires only word alignment for double. It's more performant if it // is double-word aligned, though. -def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, +def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64, (sequence "D%u", 0, 31)> { // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on // Darwin platforms. @@ -422,20 +433,20 @@ def FPWithVPR : RegisterClass<"ARM", [f32], 32, (add SPR, DPR, VPR)> { // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs).
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, +def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64, (trunc DPR, 16)> { let DiagnosticString = "operand must be a register in range [d0, d15]"; } // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations -def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, +def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64, (trunc DPR, 8)> { let DiagnosticString = "operand must be a register in range [d0, d7]"; } // Generic 128-bit vector register class. -def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, +def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, v8bf16], 128, (sequence "Q%u", 0, 15)> { // Allocate non-VFP2 aliases Q8-Q15 first. let AltOrders = [(rotl QPR, 8), (trunc QPR, 8)]; @@ -577,3 +588,6 @@ def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6], // Spaced quads of D registers. def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>; + +// FP context payload +def FPCXTRegs : RegisterClass<"ARM", [i32], 32, (add FPCXTNS)>; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td index a79f3348f338..d9a8d304c41f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleA57.td @@ -96,7 +96,7 @@ def CortexA57Model : SchedMachineModel { let FullInstRWOverlapCheck = 0; let UnsupportedFeatures = [HasV8_1MMainline, HasMVEInt, HasMVEFloat, - HasFPRegsV8_1M]; + HasFPRegsV8_1M, HasFP16FML, HasMatMulInt8, HasBF16]; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td index 00a44599b1b2..e0e98bfa0e9b 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMScheduleSwift.td @@ -744,7 +744,7 @@ let SchedModel = SwiftModel in { SwiftWriteLM14CyNo, SwiftWriteLM14CyNo, SwiftWriteLM14CyNo, SwiftWriteLM14CyNo, SwiftWriteP01OneCycle, SwiftVLDMPerm5]>, - // Inaccurate: reuse describtion from 9 S registers. + // Inaccurate: reuse description from 9 S registers. SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy, SwiftWriteLM13Cy, SwiftWriteLM14CyNo, SwiftWriteLM17CyNo, SwiftWriteLM18CyNo, @@ -760,7 +760,7 @@ let SchedModel = SwiftModel in { SwiftWriteLM11CyNo, SwiftWriteLM11CyNo, SwiftWriteLM11CyNo, SwiftWriteLM11CyNo, SwiftWriteP01OneCycle, SwiftVLDMPerm3]>, - // Inaccurate: reuse describtion from 9 S registers. + // Inaccurate: reuse description from 9 S registers. SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy, SwiftWriteLM13Cy, SwiftWriteLM14CyNo, SwiftWriteLM17CyNo, SwiftWriteLM18CyNo, @@ -958,7 +958,7 @@ let SchedModel = SwiftModel in { def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - // Four element struture. + // Four element structure. 
def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$", diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index cade06e8c109..7e06229b60c3 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -126,24 +126,24 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { const ARMSubtarget &Subtarget = DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); // Do repeated 4-byte loads and stores. To be improved. // This requires 4-byte alignment. - if ((Align & 3) != 0) + if (Alignment < Align(4)) return SDValue(); // This requires the copy size to be a constant, preferably // within a subtarget-specific limit. ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); if (!ConstantSize) - return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, - RTLIB::MEMCPY); + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, + Alignment.value(), RTLIB::MEMCPY); uint64_t SizeVal = ConstantSize->getZExtValue(); if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) - return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, - RTLIB::MEMCPY); + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, + Alignment.value(), RTLIB::MEMCPY); unsigned BytesLeft = SizeVal & 3; unsigned NumMemOps = SizeVal >> 2; @@ -240,16 +240,16 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, - RTLIB::MEMMOVE); + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, + Alignment.value(), RTLIB::MEMMOVE); } SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const { - return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align, - RTLIB::MEMSET); + return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, + Alignment.value(), RTLIB::MEMSET); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h index b8a86ae7310f..7aa831c09248 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h @@ -39,22 +39,22 @@ class ARMSelectionDAGInfo : public SelectionDAGTargetInfo { public: SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, - bool AlwaysInline, + SDValue Size, Align Alignment, + bool 
isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const override; SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, + Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const override; // Adjust parameters for memset, see RTABI section 4.3.4 SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, - SDValue Op3, unsigned Align, bool isVolatile, + SDValue Op3, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp index eb4d39b01cbb..46802037c2aa 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -183,7 +183,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (!ArchFS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); else - ArchFS = FS; + ArchFS = std::string(FS); } ParseSubtargetFeatures(CPUString, ArchFS); @@ -292,12 +292,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { case CortexA73: case CortexA75: case CortexA76: + case CortexA77: + case CortexA78: case CortexR4: case CortexR4F: case CortexR5: case CortexR7: case CortexM3: case CortexR52: + case CortexX1: break; case Exynos: LdStMultipleTiming = SingleIssuePlusExtras; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h index 6bdd021970ef..2703e385dd81 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMSubtarget.h @@ -28,6 +28,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCSchedule.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include <memory> #include <string> @@ -60,6 +61,8 @@ protected: CortexA73, CortexA75, CortexA76, + CortexA77, + CortexA78, CortexA8, CortexA9, CortexM3, @@ -68,6 +71,7 @@ protected: CortexR5, CortexR52, CortexR7, + CortexX1, Exynos, Krait, Kryo, @@ -108,6 +112,7 @@ protected: ARMv83a, ARMv84a, ARMv85a, + ARMv86a, ARMv8a, ARMv8mBaseline, ARMv8mMainline, @@ -157,11 +162,13 @@ protected: bool HasV8_3aOps = false; bool HasV8_4aOps = false; bool HasV8_5aOps = false; + bool HasV8_6aOps = false; bool HasV8MBaselineOps = false; bool HasV8MMainlineOps = false; bool HasV8_1MMainlineOps = false; bool HasMVEIntegerOps = false; bool HasMVEFloatOps = false; + bool HasCDEOps = false; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -254,6 +261,12 @@ protected: /// HasFP16FML - True if subtarget supports half-precision FP fml operations bool HasFP16FML = false; + /// HasBF16 - True if subtarget supports BFloat16 floating point operations + bool HasBF16 = false; + + /// HasMatMulInt8 - True if subtarget supports 8-bit integer matrix multiply + bool HasMatMulInt8 = false; + /// HasD32 - True if subtarget has the full 32 double precision /// FP registers for VFPv3. 
bool HasD32 = false; @@ -562,6 +575,7 @@ private: void initSubtargetFeatures(StringRef CPU, StringRef FS); ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS); + std::bitset<8> CoprocCDE = {}; public: void computeIssueWidth(); @@ -579,11 +593,13 @@ public: bool hasV8_3aOps() const { return HasV8_3aOps; } bool hasV8_4aOps() const { return HasV8_4aOps; } bool hasV8_5aOps() const { return HasV8_5aOps; } + bool hasV8_6aOps() const { return HasV8_6aOps; } bool hasV8MBaselineOps() const { return HasV8MBaselineOps; } bool hasV8MMainlineOps() const { return HasV8MMainlineOps; } bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; } bool hasMVEIntegerOps() const { return HasMVEIntegerOps; } bool hasMVEFloatOps() const { return HasMVEFloatOps; } + bool hasCDEOps() const { return HasCDEOps; } bool hasFPRegs() const { return HasFPRegs; } bool hasFPRegs16() const { return HasFPRegs16; } bool hasFPRegs64() const { return HasFPRegs64; } @@ -689,12 +705,15 @@ public: bool hasD32() const { return HasD32; } bool hasFullFP16() const { return HasFullFP16; } bool hasFP16FML() const { return HasFP16FML; } + bool hasBF16() const { return HasBF16; } bool hasFuseAES() const { return HasFuseAES; } bool hasFuseLiterals() const { return HasFuseLiterals; } /// Return true if the CPU supports any kind of instruction fusion. bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); } + bool hasMatMulInt8() const { return HasMatMulInt8; } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 84876eda33a6..9ead5fa4308c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -96,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() { initializeARMExpandPseudoPass(Registry); initializeThumb2SizeReducePass(Registry); initializeMVEVPTBlockPass(Registry); + initializeMVEVPTOptimisationsPass(Registry); initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); initializeMVEGatherScatterLoweringPass(Registry); @@ -243,7 +244,14 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, this->Options.NoTrapAfterNoreturn = true; } + // ARM supports the debug entry values. + setSupportsDebugEntryValues(true); + initAsmInfo(); + + // ARM supports the MachineOutliner. + setMachineOutliner(true); + setSupportsDefaultOutlining(false); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; @@ -359,6 +367,7 @@ public: void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; + void addPreEmitPass2() override; std::unique_ptr<CSEConfigBase> getCSEConfig() const override; }; @@ -483,6 +492,8 @@ bool ARMPassConfig::addGlobalInstructionSelect() { void ARMPassConfig::addPreRegAlloc() { if (getOptLevel() != CodeGenOpt::None) { + addPass(createMVEVPTOptimisationsPass()); + addPass(createMLxExpansionPass()); if (EnableARMLoadStoreOpt) @@ -507,9 +518,12 @@ void ARMPassConfig::addPreSched2() { addPass(createARMExpandPseudoPass()); if (getOptLevel() != CodeGenOpt::None) { - // in v8, IfConversion depends on Thumb instruction widths + // When optimising for size, always run the Thumb2SizeReduction pass before + // IfConversion. Otherwise, check whether IT blocks are restricted + // (e.g. 
in v8, IfConversion depends on Thumb instruction widths) addPass(createThumb2SizeReductionPass([this](const Function &F) { - return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT(); + return this->TM->getSubtarget<ARMSubtarget>(F).hasMinSize() || + this->TM->getSubtarget<ARMSubtarget>(F).restrictIT(); })); addPass(createIfConverter([](const MachineFunction &MF) { @@ -538,7 +552,9 @@ void ARMPassConfig::addPreEmitPass() { // Don't optimize barriers at -O0. if (getOptLevel() != CodeGenOpt::None) addPass(createARMOptimizeBarriersPass()); +} +void ARMPassConfig::addPreEmitPass2() { addPass(createARMConstantIslandPass()); addPass(createARMLowOverheadLoopsPass()); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp index 891329d3f297..3f0e3360632d 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -49,7 +49,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, // Since we cannot modify flags for an existing section, we create a new // section with the right flags, and use 0 as the unique ID for // execute-only text - TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U); + TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U, nullptr); } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 7ff05034c1f2..bea4e157a131 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -16,18 +16,19 @@ #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -45,7 +46,7 @@ static cl::opt<bool> DisableLowOverheadLoops( "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); -extern cl::opt<bool> DisableTailPredication; +extern cl::opt<TailPredication::Mode> EnableTailPredication; extern cl::opt<bool> EnableMaskedGatherScatters; @@ -57,17 +58,32 @@ bool ARMTTIImpl::areInlineCompatible(const Function *Caller, const FeatureBitset &CalleeBits = TM.getSubtargetImpl(*Callee)->getFeatureBits(); - // To inline a callee, all features not in the whitelist must match exactly. - bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) == - (CalleeBits & ~InlineFeatureWhitelist); - // For features in the whitelist, the callee's features must be a subset of + // To inline a callee, all features not in the allowed list must match exactly. + bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) == + (CalleeBits & ~InlineFeaturesAllowed); + // For features in the allowed list, the callee's features must be a subset of // the callers'. 
- bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) == - (CalleeBits & InlineFeatureWhitelist); + bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) == + (CalleeBits & InlineFeaturesAllowed); return MatchExact && MatchSubset; } -int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const { + if (L->getHeader()->getParent()->hasOptSize()) + return false; + if (ST->hasMVEIntegerOps()) + return false; + return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; +} + +bool ARMTTIImpl::shouldFavorPostInc() const { + if (ST->hasMVEIntegerOps()) + return true; + return false; +} + +int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned Bits = Ty->getPrimitiveSizeInBits(); @@ -110,7 +126,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, } int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) { + Type *Ty, TTI::TargetCostKind CostKind) { // Division by a constant can be turned into multiplication, but only if we // know it's constant. So it's not so much that the immediate is cheap (it's // not), but that the alternative is worse. @@ -125,12 +141,14 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im if (Imm == 255 || Imm == 65535) return 0; // Conversion to BIC is free, and means we can use ~Imm instead. - return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty)); + return std::min(getIntImmCost(Imm, Ty, CostKind), + getIntImmCost(~Imm, Ty, CostKind)); } if (Opcode == Instruction::Add) // Conversion to SUB is free, and means we can use -Imm instead. - return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty)); + return std::min(getIntImmCost(Imm, Ty, CostKind), + getIntImmCost(-Imm, Ty, CostKind)); if (Opcode == Instruction::ICmp && Imm.isNegative() && Ty->getIntegerBitWidth() == 32) { @@ -147,34 +165,27 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im if (Opcode == Instruction::Xor && Imm.isAllOnesValue()) return 0; - return getIntImmCost(Imm, Ty); + return getIntImmCost(Imm, Ty, CostKind); } int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - // Single to/from double precision conversions. - static const CostTblEntry NEONFltDblTbl[] = { - // Vector fptrunc/fpext conversions. - { ISD::FP_ROUND, MVT::v2f64, 2 }, - { ISD::FP_EXTEND, MVT::v2f32, 2 }, - { ISD::FP_EXTEND, MVT::v4f32, 4 } + // TODO: Allow non-throughput costs that aren't binary. + auto AdjustCost = [&CostKind](int Cost) { + if (CostKind != TTI::TCK_RecipThroughput) + return Cost == 0 ? 
0 : 1; + return Cost; }; - if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || - ISD == ISD::FP_EXTEND)) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) - return LT.first * Entry->Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); // The extend of a load is free if (I && isa<LoadInst>(I->getOperand(0))) { @@ -194,7 +205,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, }; if (const auto *Entry = ConvertCostTableLookup( LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0}, {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0}, {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0}, {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0}, {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0}, + // The following entries extend from a legal type to an illegal type, so the + // load needs to be split. This introduces an extra load operation, but the + // extend is still "free". + {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1}, + {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1}, + {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3}, + {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3}, + {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1}, + {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1}, }; if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { if (const auto *Entry = ConvertCostTableLookup(MVELoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); + } + + static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { + // FPExtends are similar but also require the VCVT instructions. + {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, + {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, + }; + if (SrcTy.isVector() && ST->hasMVEFloatOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); + } + } + + // The truncate of a store is free. This is the mirror of extends above.
+ if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) { + static const TypeConversionCostTblEntry MVELoadConversionTbl[] = { + {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0}, + {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0}, + {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0}, + {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1}, + {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3}, + {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1}, + }; + if (SrcTy.isVector() && ST->hasMVEIntegerOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVELoadConversionTbl, ISD, SrcTy.getSimpleVT(), + DstTy.getSimpleVT())) + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); + } + + static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { + {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, + {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, + }; + if (SrcTy.isVector() && ST->hasMVEFloatOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(), + DstTy.getSimpleVT())) + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } } + // NEON vector operations that can extend their inputs. + if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) && + I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) { + static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = { + // vaddl + { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 }, + // vsubl + { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 }, + // vmull + { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 }, + // vshll + { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 }, + }; + + auto *User = cast<Instruction>(*I->user_begin()); + int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode()); + if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) { + return AdjustCost(Entry->Cost); + } + } + + // Single to/from double precision conversions. + if (Src->isVectorTy() && ST->hasNEON() && + ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 && + DstTy.getScalarType() == MVT::f32) || + (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 && + DstTy.getScalarType() == MVT::f64))) { + static const CostTblEntry NEONFltDblTbl[] = { + // Vector fptrunc/fpext conversions. + {ISD::FP_ROUND, MVT::v2f64, 2}, + {ISD::FP_EXTEND, MVT::v2f32, 2}, + {ISD::FP_EXTEND, MVT::v4f32, 4}}; + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) + return AdjustCost(LT.first * Entry->Cost); + } + // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { - { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, - { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, // The number of vmovl instructions for the extension. 
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, @@ -294,7 +407,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); } // Scalar float to integer conversions. @@ -324,7 +437,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); } // Scalar integer to float conversions. @@ -355,7 +468,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); } // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one @@ -380,7 +493,28 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost * ST->getMVEVectorCostFactor(); + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); + } + + if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) { + // As a general rule, fp converts that were not matched above are + // scalarized and cost 1 vcvt for each lane, so long as the instruction + // is available. If not, it will become a series of function calls. + const int CallCost = getCallInstrCost(nullptr, Dst, {Src}, CostKind); + int Lanes = 1; + if (SrcTy.isFixedLengthVector()) + Lanes = SrcTy.getVectorNumElements(); + auto IsLegal = [this](EVT VT) { + EVT EltVT = VT.getScalarType(); + return (EltVT == MVT::f32 && ST->hasVFP2Base()) || + (EltVT == MVT::f64 && ST->hasFP64()) || + (EltVT == MVT::f16 && ST->hasFullFP16()); + }; + + if (IsLegal(SrcTy) && IsLegal(DstTy)) + return Lanes; + else + return Lanes * CallCost; } // Scalar integer conversion costs. @@ -399,13 +533,14 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) - return Entry->Cost; + return AdjustCost(Entry->Cost); } int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() ? ST->getMVEVectorCostFactor() : 1; - return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost( + BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); } int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, @@ -420,7 +555,7 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, Opcode == Instruction::ExtractElement)) { // Cross-class copies are expensive on many microarchitectures, // so assume they are expensive by default.
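The rule encoded in the fp-convert fallback above: a round or extend that no table matched is scalarized at one vcvt per lane when both element types have hardware support, and one libcall per lane otherwise. A minimal sketch of that decision, with CallCost standing in for the getCallInstrCost result:

    // Sketch of the scalarized fp-convert rule: one vcvt per lane when both
    // element types are natively convertible, one libcall per lane when not.
    int scalarFpConvertCost(int Lanes, bool SrcLegal, bool DstLegal,
                            int CallCost) {
      if (SrcLegal && DstLegal)
        return Lanes;          // one vcvt per lane
      return Lanes * CallCost; // becomes a series of function calls
    }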
- if (ValTy->getVectorElementType()->isIntegerTy()) + if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy()) return 3; // Even if it's not a cross class copy, this likely leads to mixing @@ -438,14 +573,19 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, // result anyway. return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), ST->getMVEVectorCostFactor()) * - ValTy->getVectorNumElements() / 2; + cast<FixedVectorType>(ValTy)->getNumElements() / 2; } return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + TTI::TargetCostKind CostKind, const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); + int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a vector select gets lowered to vbsl. if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { @@ -472,7 +612,8 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy() ? ST->getMVEVectorCostFactor() : 1; - return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, + I); } int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, @@ -496,11 +637,28 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, return BaseT::getAddressComputationCost(Ty, SE, Ptr); } -bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { +bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + // If a VCTP is part of a chain, it's already profitable and shouldn't be + // optimized, else LSR may block tail-predication. + switch (II->getIntrinsicID()) { + case Intrinsic::arm_mve_vctp8: + case Intrinsic::arm_mve_vctp16: + case Intrinsic::arm_mve_vctp32: + case Intrinsic::arm_mve_vctp64: + return true; + default: + break; + } + } + return false; +} + +bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) return false; - if (auto *VecTy = dyn_cast<VectorType>(DataTy)) { + if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) { // Don't support v2i1 yet. 
if (VecTy->getNumElements() == 2) return false; @@ -512,12 +670,11 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { } unsigned EltWidth = DataTy->getScalarSizeInBits(); - return (EltWidth == 32 && (!Alignment || Alignment >= 4)) || - (EltWidth == 16 && (!Alignment || Alignment >= 2)) || - (EltWidth == 8); + return (EltWidth == 32 && Alignment >= 4) || + (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8); } -bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) { +bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) { if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) return false; @@ -534,8 +691,8 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) { return false; unsigned EltWidth = Ty->getScalarSizeInBits(); - return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) || - (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8); + return ((EltWidth == 32 && Alignment >= 4) || + (EltWidth == 16 && Alignment >= 2) || EltWidth == 8); } int ARMTTIImpl::getMemcpyCost(const Instruction *I) { @@ -552,8 +709,8 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) { return LibCallCost; const unsigned Size = C->getValue().getZExtValue(); - const unsigned DstAlign = MI->getDestAlignment(); - const unsigned SrcAlign = MI->getSourceAlignment(); + const Align DstAlign = *MI->getDestAlign(); + const Align SrcAlign = *MI->getSourceAlign(); const Function *F = I->getParent()->getParent(); const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize()); std::vector<EVT> MemOps; @@ -562,8 +719,9 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) { // loaded and stored. That's why we multiply the number of elements by 2 to // get the cost for this memcpy. if (getTLI()->findOptimalMemOpLowering( - MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/, - false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/, + MemOps, Limit, + MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign, + /*IsVolatile*/ true), MI->getDestAddressSpace(), MI->getSourceAddressSpace(), F->getAttributes())) return MemOps.size() * 2; @@ -572,8 +730,8 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) { return LibCallCost; } -int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, + int Index, VectorType *SubTp) { if (ST->hasNEON()) { if (Kind == TTI::SK_Broadcast) { static const CostTblEntry NEONDupTbl[] = { @@ -667,12 +825,19 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, } int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { + // TODO: Handle more cost kinds. 
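The alignment rule used above for both MVE masked loads/stores and masked gathers reads as a small predicate: byte elements are always legal, while 16- and 32-bit elements need natural alignment. A standalone restatement (illustrative names; two-lane vectors are rejected earlier since v2i1 masks are not yet supported):

    // The MVE masked load/store legality rule as a predicate:
    // 8-bit elements are always fine, wider ones need natural alignment.
    bool mveMaskedAccessLegal(unsigned EltWidthBits, unsigned AlignBytes) {
      return (EltWidthBits == 32 && AlignBytes >= 4) ||
             (EltWidthBits == 16 && AlignBytes >= 2) ||
             EltWidthBits == 8;
    }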
+ if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, + Op2Info, Opd1PropInfo, + Opd2PropInfo, Args, CxtI); + int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -723,7 +888,8 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) return LT.first * Entry->Cost; - int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, + Op2Info, Opd1PropInfo, Opd2PropInfo); // This is somewhat of a hack. The problem that we are facing is that SROA @@ -779,12 +945,13 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return LT.first * BaseCost; // Else this is expand, assume that we need to scalarize this op. - if (Ty->isVectorTy()) { - unsigned Num = Ty->getVectorNumElements(); - unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { + unsigned Num = VTy->getNumElements(); + unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(), + CostKind); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost; + return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost; } return BaseCost; @@ -792,26 +959,53 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, const Instruction *I) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return 1; + + // Type legalization can't handle structs + if (TLI->getValueType(DL, Src, true) == MVT::Other) + return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, + CostKind); if (ST->hasNEON() && Src->isVectorTy() && (Alignment && *Alignment != Align(16)) && - Src->getVectorElementType()->isDoubleTy()) { + cast<VectorType>(Src)->getElementType()->isDoubleTy()) { // Unaligned loads/stores are extremely inefficient. // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); return LT.first * 4; } + + // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. + // Same for stores. + if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I && + ((Opcode == Instruction::Load && I->hasOneUse() && + isa<FPExtInst>(*I->user_begin())) || + (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) { + FixedVectorType *SrcVTy = cast<FixedVectorType>(Src); + Type *DstTy = + Opcode == Instruction::Load + ? (*I->user_begin())->getType() + : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType(); + if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && + DstTy->getScalarType()->isFloatTy()) + return ST->getMVEVectorCostFactor(); + } + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() ? 
ST->getMVEVectorCostFactor() : 1; - return BaseCost * LT.first; + return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, + CostKind, I); } int ARMTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, - bool UseMaskForGaps) { + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); @@ -820,8 +1014,9 @@ int ARMTTIImpl::getInterleavedMemoryOpCost( if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && !UseMaskForCond && !UseMaskForGaps) { - unsigned NumElts = VecTy->getVectorNumElements(); - auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); + unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements(); + auto *SubVecTy = + FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); // vldN/vstN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be @@ -842,10 +1037,109 @@ int ARMTTIImpl::getInterleavedMemoryOpCost( } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } +unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I) { + using namespace PatternMatch; + if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters) + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!"); + auto *VTy = cast<FixedVectorType>(DataTy); + + // TODO: Splitting, once we do that. + + unsigned NumElems = VTy->getNumElements(); + unsigned EltSize = VTy->getScalarSizeInBits(); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy); + + // For now, it is assumed that for the MVE gather instructions the loads are + // all effectively serialised. This means the cost is the scalar cost + // multiplied by the number of elements being loaded. This is possibly very + // conservative, but even so we still end up vectorising loops because the + // cost per iteration for many loops is lower than for scalar loops. + unsigned VectorCost = NumElems * LT.first; + // The scalarization cost should be a lot higher. We use the number of vector + // elements plus the scalarization overhead. 
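In other words, a gather is charged as if each lane were a separate scalar load, and scalarizing only adds the insert/extract overhead on top. The shape of that decision as a sketch; LegalizationFactor stands in for LT.first and ScalarizationOvh for the scalarization overhead, both names invented here:

    // Shape of the MVE gather cost model: the "vector" cost already scales
    // with the lane count because the lanes are assumed serialised, and
    // scalarization only adds insert/extract overhead on top.
    int gatherCost(int NumElems, int LegalizationFactor,
                   int ScalarizationOvh, bool CanUseGather) {
      int VectorCost = NumElems * LegalizationFactor;
      int ScalarCost = VectorCost + ScalarizationOvh;
      return CanUseGather ? VectorCost : ScalarCost;
    }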
+ unsigned ScalarCost = + NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {}); + + if (Alignment < EltSize / 8) + return ScalarCost; + + unsigned ExtSize = EltSize; + // Check whether there's a single user that asks for an extended type + if (I != nullptr) { + // Depending on the caller of this function, a gather instruction will + // either have opcode Instruction::Load or be a call to the masked_gather + // intrinsic + if ((I->getOpcode() == Instruction::Load || + match(I, m_Intrinsic<Intrinsic::masked_gather>())) && + I->hasOneUse()) { + const User *Us = *I->users().begin(); + if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) { + // Only allow valid type combinations + unsigned TypeSize = + cast<Instruction>(Us)->getType()->getScalarSizeInBits(); + if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) || + (TypeSize == 16 && EltSize == 8)) && + TypeSize * NumElems == 128) { + ExtSize = TypeSize; + } + } + } + // Check whether the input data needs to be truncated + TruncInst *T; + if ((I->getOpcode() == Instruction::Store || + match(I, m_Intrinsic<Intrinsic::masked_scatter>())) && + (T = dyn_cast<TruncInst>(I->getOperand(0)))) { + // Only allow valid type combinations + unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits(); + if (((EltSize == 16 && TypeSize == 32) || + (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) && + TypeSize * NumElems == 128) + ExtSize = TypeSize; + } + } + + if (ExtSize * NumElems != 128 || NumElems < 4) + return ScalarCost; + + // Any (aligned) i32 gather will not need to be scalarised. + if (ExtSize == 32) + return VectorCost; + // For smaller types, we need to ensure that the gep's inputs are correctly + // extended from a small enough value. Other sizes (including i64) are + // scalarized for now. + if (ExtSize != 8 && ExtSize != 16) + return ScalarCost; + + if (const auto *BC = dyn_cast<BitCastInst>(Ptr)) + Ptr = BC->getOperand(0); + if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { + if (GEP->getNumOperands() != 2) + return ScalarCost; + unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType()); + // Scale needs to be correct (which is only relevant for i16s). + if (Scale != 1 && Scale * 8 != ExtSize) + return ScalarCost; + // And we need to zext (not sext) the indexes from a small enough type. + if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) { + if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize) + return VectorCost; + } + return ScalarCost; + } + return ScalarCost; +} + bool ARMTTIImpl::isLoweredToCall(const Function *F) { if (!F->isIntrinsic()) BaseT::isLoweredToCall(F); @@ -913,23 +1207,31 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, HardwareLoopInfo &HWLoopInfo) { // Low-overhead branches are only supported in the 'low-overhead branch' // extension of v8.1-m.
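For reference, the extending/truncating gather and scatter cases in the costing above only stay vectorized when the widened lanes still fill a single 128-bit q-register with at least four lanes; a compact restatement of those checks (illustrative helper mirroring the code, not part of the patch):

    // The widening combinations the extending-gather costing accepts:
    // the extended lanes must exactly fill a 128-bit q-register and there
    // must be at least four of them.
    bool qualifiesForExtendingGather(unsigned EltSize, unsigned ExtSize,
                                     unsigned NumElems) {
      bool ValidCombo = (ExtSize == 32 && (EltSize == 8 || EltSize == 16)) ||
                        (ExtSize == 16 && EltSize == 8);
      return ValidCombo && ExtSize * NumElems == 128 && NumElems >= 4;
    }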
- if (!ST->hasLOB() || DisableLowOverheadLoops) + if (!ST->hasLOB() || DisableLowOverheadLoops) { + LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n"); return false; + } - if (!SE.hasLoopInvariantBackedgeTakenCount(L)) + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n"); return false; + } const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { + LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n"); return false; + } const SCEV *TripCountSCEV = SE.getAddExpr(BackedgeTakenCount, SE.getOne(BackedgeTakenCount->getType())); // We need to store the trip count in LR, a 32-bit register. - if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) + if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) { + LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n"); return false; + } // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little // point in generating a hardware loop if that's going to happen. @@ -1034,8 +1336,10 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, auto ScanLoop = [&](Loop *L) { for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { - if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) + if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) { + LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n"); return false; + } } } return true; @@ -1102,12 +1406,47 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI) { + LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); + + // If there are live-out values, it is probably a reduction, which needs a + // final reduction step after the loop. MVE has a VADDV instruction to reduce + // integer vectors, but doesn't have an equivalent one for float vectors. A + // live-out value that is not recognised as a reduction will result in the + // tail-predicated loop being reverted to a non-predicated loop, and this is + // very expensive, i.e. it has a significant performance impact. So, in this + // case it's better not to tail-predicate the loop, which is what we check + // here. Thus, we allow only 1 live-out value, which has to be an integer + // reduction, which matches the loops supported by ARMLowOverheadLoops. + // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in + // sync with each other. + SmallVector<Instruction *, 8> LiveOuts; + LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); + bool IntReductionsDisabled = + EnableTailPredication == TailPredication::EnabledNoReductions || + EnableTailPredication == TailPredication::ForceEnabledNoReductions; + + for (auto *I : LiveOuts) { + if (!I->getType()->isIntegerTy()) { + LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer " + "live-out value\n"); + return false; + } + if (I->getOpcode() != Instruction::Add) { + LLVM_DEBUG(dbgs() << "Only add reductions supported\n"); + return false; + } + if (IntReductionsDisabled) { + LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n"); + return false; + } + } + + // Next, check that all instructions can be tail-predicated.
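Concretely, the only live-out shape accepted here is a single integer add reduction; a float reduction fails the first check because MVE's VADDV has no floating-point counterpart. Two illustrative source-level loops (not from the patch):

    // Can be tail-predicated: a single live-out that is an integer add
    // reduction (matches what ARMLowOverheadLoops supports).
    int sum_i32(const int *A, int N) {
      int S = 0;
      for (int I = 0; I < N; ++I)
        S += A[I];
      return S;
    }

    // Rejected by the live-out check: the reduction value is a float, and
    // MVE has no float equivalent of VADDV.
    float sum_f32(const float *A, int N) {
      float S = 0.0f;
      for (int I = 0; I < N; ++I)
        S += A[I];
      return S;
    }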
PredicatedScalarEvolution PSE = LAI->getPSE(); + SmallVector<Instruction *, 16> LoadStores; int ICmpCount = 0; int Stride = 0; - LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n"); - SmallVector<Instruction *, 16> LoadStores; for (BasicBlock *BB : L->blocks()) { for (Instruction &I : BB->instructionsWithoutDebug()) { if (isa<PHINode>(&I)) @@ -1155,8 +1494,10 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI, DominatorTree *DT, const LoopAccessInfo *LAI) { - if (DisableTailPredication) + if (!EnableTailPredication) { + LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n"); return false; + } // Creating a predicated vector loop is the first step for generating a // tail-predicated hardware loop, for which we need the MVE masked @@ -1197,7 +1538,16 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, return canTailPredicateLoop(L, LI, SE, DL, LAI); } +bool ARMTTIImpl::emitGetActiveLaneMask() const { + if (!ST->hasMVEIntegerOps() || !EnableTailPredication) + return false; + // Intrinsic @llvm.get.active.lane.mask is supported. + // It is used in the MVETailPredication pass, which requires the number of + // elements processed by this vector loop to setup the tail-predicated + // loop. + return true; +} void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP) { // Only currently enable these preferences for M-Class cores. @@ -1241,8 +1591,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, return; if (isa<CallInst>(I) || isa<InvokeInst>(I)) { - ImmutableCallSite CS(&I); - if (const Function *F = CS.getCalledFunction()) { + if (const Function *F = cast<CallBase>(I).getCalledFunction()) { if (!isLoweredToCall(F)) continue; } @@ -1251,7 +1600,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, SmallVector<const Value*, 4> Operands(I.value_op_begin(), I.value_op_end()); - Cost += getUserCost(&I, Operands); + Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize); } } @@ -1271,27 +1620,12 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, UP.Force = true; } +void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + BaseT::getPeelingPreferences(L, SE, PP); +} + bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { - assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type"); - unsigned ScalarBits = Ty->getScalarSizeInBits(); - if (!ST->hasMVEIntegerOps()) - return false; - - switch (Opcode) { - case Instruction::FAdd: - case Instruction::FMul: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::Mul: - case Instruction::FCmp: - return false; - case Instruction::ICmp: - case Instruction::Add: - return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128; - default: - llvm_unreachable("Unhandled reduction opcode"); - } - return false; + return ST->hasMVEIntegerOps(); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index f66083eaf187..7bf6de4bffe0 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -38,6 +38,16 @@ class ScalarEvolution; class Type; class Value; +namespace TailPredication { + enum Mode { + Disabled = 0, + EnabledNoReductions, 
+ Enabled, + ForceEnabledNoReductions, + ForceEnabled + }; +} + class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { using BaseT = BasicTTIImplBase<ARMTTIImpl>; using TTI = TargetTransformInfo; @@ -47,13 +57,13 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { const ARMSubtarget *ST; const ARMTargetLowering *TLI; - // Currently the following features are excluded from InlineFeatureWhitelist. + // Currently the following features are excluded from InlineFeaturesAllowed. // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32 // Depending on whether they are set or unset, different // instructions/registers are available. For example, inlining a callee with // -thumb-mode in a caller with +thumb-mode, may cause the assembler to // fail if the callee uses ARM only instructions, e.g. in inline asm. - const FeatureBitset InlineFeatureWhitelist = { + const FeatureBitset InlineFeaturesAllowed = { ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2, ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8, ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb, @@ -93,11 +103,8 @@ public: bool enableInterleavedAccessVectorization() { return true; } - bool shouldFavorBackedgeIndex(const Loop *L) const { - if (L->getHeader()->getParent()->hasOptSize()) - return false; - return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1; - } + bool shouldFavorBackedgeIndex(const Loop *L) const; + bool shouldFavorPostInc() const; /// Floating-point computation using ARMv8 AArch32 Advanced /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD @@ -113,9 +120,10 @@ public: Type *Ty); using BaseT::getIntImmCost; - int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind); - int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty, TTI::TargetCostKind CostKind); /// @} @@ -153,19 +161,24 @@ public: return ST->getMaxInterleaveFactor(); } - bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment); + bool isProfitableLSRChainElement(Instruction *I); - bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) { + bool isLegalMaskedLoad(Type *DataTy, Align Alignment); + + bool isLegalMaskedStore(Type *DataTy, Align Alignment) { return isLegalMaskedLoad(DataTy, Alignment); } - bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment); + bool isLegalMaskedGather(Type *Ty, Align Alignment); - bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; } + bool isLegalMaskedScatter(Type *Ty, Align Alignment) { + return isLegalMaskedGather(Ty, Alignment); + } int getMemcpyCost(const Instruction *I); - int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, + VectorType *SubTp); bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; @@ -194,9 +207,11 @@ public: } int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); @@ -206,6 +221,7 @@ public: int getArithmeticInstrCost( unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, 
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, @@ -214,13 +230,20 @@ public: const Instruction *CxtI = nullptr); int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, - unsigned AddressSpace, const Instruction *I = nullptr); + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, - ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond = false, - bool UseMaskForGaps = false); + int getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency, + bool UseMaskForCond = false, bool UseMaskForGaps = false); + + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); bool isLoweredToCall(const Function *F); bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, @@ -236,6 +259,10 @@ public: void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); + bool emitGetActiveLaneMask() const; + + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); bool shouldBuildLookupTablesForConstant(Constant *C) const { // In the ROPI and RWPI relocation models we can't have pointers to global // variables or functions in constant data, so don't convert switches to diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index f6d76ee09534..05f870b90ecd 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" @@ -180,10 +181,68 @@ public: } }; +// Various sets of ARM instruction mnemonics which are used by the asm parser +class ARMMnemonicSets { + StringSet<> CDE; + StringSet<> CDEWithVPTSuffix; +public: + ARMMnemonicSets(const MCSubtargetInfo &STI); + + /// Returns true iff a given mnemonic is a CDE instruction + bool isCDEInstr(StringRef Mnemonic) { + // Quick check before searching the set + if (!Mnemonic.startswith("cx") && !Mnemonic.startswith("vcx")) + return false; + return CDE.count(Mnemonic); + } + + /// Returns true iff a given mnemonic is a VPT-predicable CDE instruction + /// (possibly with a predication suffix "e" or "t") + bool isVPTPredicableCDEInstr(StringRef Mnemonic) { + if (!Mnemonic.startswith("vcx")) + return false; + return CDEWithVPTSuffix.count(Mnemonic); + } + + /// Returns true iff a given mnemonic is an IT-predicable CDE instruction + /// (possibly with a condition suffix) + bool isITPredicableCDEInstr(StringRef Mnemonic) { + if (!Mnemonic.startswith("cx")) + return false; + return Mnemonic.startswith("cx1a") || Mnemonic.startswith("cx1da") || + Mnemonic.startswith("cx2a") || Mnemonic.startswith("cx2da") || + Mnemonic.startswith("cx3a") || Mnemonic.startswith("cx3da"); + } + + /// Return true iff a given mnemonic is an integer CDE instruction 
with + /// dual-register destination + bool isCDEDualRegInstr(StringRef Mnemonic) { + if (!Mnemonic.startswith("cx")) + return false; + return Mnemonic == "cx1d" || Mnemonic == "cx1da" || + Mnemonic == "cx2d" || Mnemonic == "cx2da" || + Mnemonic == "cx3d" || Mnemonic == "cx3da"; + } +}; + +ARMMnemonicSets::ARMMnemonicSets(const MCSubtargetInfo &STI) { + for (StringRef Mnemonic: { "cx1", "cx1a", "cx1d", "cx1da", + "cx2", "cx2a", "cx2d", "cx2da", + "cx3", "cx3a", "cx3d", "cx3da", }) + CDE.insert(Mnemonic); + for (StringRef Mnemonic : + {"vcx1", "vcx1a", "vcx2", "vcx2a", "vcx3", "vcx3a"}) { + CDE.insert(Mnemonic); + CDEWithVPTSuffix.insert(Mnemonic); + CDEWithVPTSuffix.insert(std::string(Mnemonic) + "t"); + CDEWithVPTSuffix.insert(std::string(Mnemonic) + "e"); + } +} class ARMAsmParser : public MCTargetAsmParser { const MCRegisterInfo *MRI; UnwindContext UC; + ARMMnemonicSets MS; ARMTargetStreamer &getTargetStreamer() { assert(getParser().getStreamer().getTargetStreamer() && @@ -245,12 +304,12 @@ class ARMAsmParser : public MCTargetAsmParser { ITInst.setOpcode(ARM::t2IT); ITInst.addOperand(MCOperand::createImm(ITState.Cond)); ITInst.addOperand(MCOperand::createImm(ITState.Mask)); - Out.EmitInstruction(ITInst, getSTI()); + Out.emitInstruction(ITInst, getSTI()); // Emit the conditional instructions assert(PendingConditionalInsts.size() <= 4); for (const MCInst &Inst : PendingConditionalInsts) { - Out.EmitInstruction(Inst, getSTI()); + Out.emitInstruction(Inst, getSTI()); } PendingConditionalInsts.clear(); @@ -444,6 +503,8 @@ class ARMAsmParser : public MCTargetAsmParser { void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting, OperandVector &Operands); + bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands); + bool isThumb() const { // FIXME: Can tablegen auto-generate this? return getSTI().getFeatureBits()[ARM::ModeThumb]; @@ -501,6 +562,9 @@ class ARMAsmParser : public MCTargetAsmParser { bool hasMVEFloat() const { return getSTI().getFeatureBits()[ARM::HasMVEFloatOps]; } + bool hasCDE() const { + return getSTI().getFeatureBits()[ARM::HasCDEOps]; + } bool has8MSecExt() const { return getSTI().getFeatureBits()[ARM::Feature8MSecExt]; } @@ -605,7 +669,7 @@ public: ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, STI, MII), UC(Parser) { + : MCTargetAsmParser(Options, STI, MII), UC(Parser), MS(STI) { MCAsmParserExtension::Initialize(Parser); // Cache the MCRegisterInfo.
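The constructor enumerates the CDE mnemonics once up front and precomputes the VPT-suffixed variants, so the per-instruction queries are set lookups rather than string surgery. The same idea in standalone form, with std::set standing in for llvm::StringSet:

    #include <set>
    #include <string>

    // Standalone version of the VPT-suffix precomputation, using std::set
    // in place of llvm::StringSet (illustration only).
    std::set<std::string> buildVPTSuffixedCDESet() {
      std::set<std::string> S;
      for (const char *M : {"vcx1", "vcx1a", "vcx2", "vcx2a", "vcx3", "vcx3a"}) {
        S.insert(M);
        S.insert(std::string(M) + "t"); // VPT "then" suffix
        S.insert(std::string(M) + "e"); // VPT "else" suffix
      }
      return S;
    }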
@@ -628,6 +692,8 @@ public: // Implementation of the MCTargetAsmParser interface: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) override; bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; bool ParseDirective(AsmToken DirectiveID) override; @@ -3553,8 +3619,7 @@ public: if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) Kind = k_RegisterListWithAPSR; - assert(std::is_sorted(Regs.begin(), Regs.end()) && - "Register list must be sorted by encoding"); + assert(llvm::is_sorted(Regs) && "Register list must be sorted by encoding"); auto Op = std::make_unique<ARMOperand>(Kind); for (const auto &P : Regs) @@ -3885,6 +3950,14 @@ bool ARMAsmParser::ParseRegister(unsigned &RegNo, return (RegNo == (unsigned)-1); } +OperandMatchResultTy ARMAsmParser::tryParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) { + if (ParseRegister(RegNo, StartLoc, EndLoc)) + return MatchOperand_NoMatch; + return MatchOperand_Success; +} + /// Try to parse a register name. The token must be an Identifier when called, /// and if it is a register name the token is eaten and the register number is /// returned. Otherwise return -1. @@ -6045,20 +6118,35 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { case AsmToken::LCurly: return parseRegisterList(Operands, !Mnemonic.startswith("clr")); case AsmToken::Dollar: - case AsmToken::Hash: - // #42 -> immediate. + case AsmToken::Hash: { + // #42 -> immediate + // $ 42 -> immediate + // $foo -> symbol name + // $42 -> symbol name S = Parser.getTok().getLoc(); - Parser.Lex(); + + // Favor the interpretation of $-prefixed operands as symbol names. + // Cases where immediates are explicitly expected are handled by their + // specific ParseMethod implementations. + auto AdjacentToken = getLexer().peekTok(/*ShouldSkipSpace=*/false); + bool ExpectIdentifier = Parser.getTok().is(AsmToken::Dollar) && + (AdjacentToken.is(AsmToken::Identifier) || + AdjacentToken.is(AsmToken::Integer)); + if (!ExpectIdentifier) { + // Token is not part of identifier. Drop leading $ or # before parsing + // expression. + Parser.Lex(); + } if (Parser.getTok().isNot(AsmToken::Colon)) { - bool isNegative = Parser.getTok().is(AsmToken::Minus); + bool IsNegative = Parser.getTok().is(AsmToken::Minus); const MCExpr *ImmVal; if (getParser().parseExpression(ImmVal)) return true; const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal); if (CE) { int32_t Val = CE->getValue(); - if (isNegative && Val == 0) + if (IsNegative && Val == 0) ImmVal = MCConstantExpr::create(std::numeric_limits<int32_t>::min(), getContext()); } @@ -6077,7 +6165,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { } // w/ a ':' after the '#', it's just like a plain ':'. 
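A compact restatement of the '$'/'#' disambiguation above; the real code peeks at the next lexer token without skipping whitespace, so only a '$' glued directly to an identifier or integer is kept as a symbol name. Assert-style examples, illustrative only:

    #include <cassert>

    // '$' is kept as a symbol prefix only when glued to an identifier or
    // integer; every other '$'/'#' prefix is dropped and parsed as an
    // immediate expression.
    bool parsesAsSymbolName(char Prefix, bool GluedIdentOrInt) {
      return Prefix == '$' && GluedIdentOrInt;
    }

    int main() {
      assert(parsesAsSymbolName('$', true));   // $foo, $42
      assert(!parsesAsSymbolName('$', false)); // $ 42 -> immediate
      assert(!parsesAsSymbolName('#', true));  // #42  -> immediate
    }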
LLVM_FALLTHROUGH; - + } case AsmToken::Colon: { S = Parser.getTok().getLoc(); // ":lower16:" and ":upper16:" expression prefixes @@ -6233,6 +6321,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic, Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" || Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" || Mnemonic == "bxns" || Mnemonic == "blxns" || + Mnemonic == "vdot" || Mnemonic == "vmmla" || Mnemonic == "vudot" || Mnemonic == "vsdot" || Mnemonic == "vcmla" || Mnemonic == "vcadd" || Mnemonic == "vfmal" || Mnemonic == "vfmsl" || @@ -6373,14 +6462,20 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic, Mnemonic == "vudot" || Mnemonic == "vsdot" || Mnemonic == "vcmla" || Mnemonic == "vcadd" || Mnemonic == "vfmal" || Mnemonic == "vfmsl" || + Mnemonic == "vfmat" || Mnemonic == "vfmab" || + Mnemonic == "vdot" || Mnemonic == "vmmla" || Mnemonic == "sb" || Mnemonic == "ssbb" || - Mnemonic == "pssbb" || + Mnemonic == "pssbb" || Mnemonic == "vsmmla" || + Mnemonic == "vummla" || Mnemonic == "vusmmla" || + Mnemonic == "vusdot" || Mnemonic == "vsudot" || Mnemonic == "bfcsel" || Mnemonic == "wls" || Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || Mnemonic == "csetm" || Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") || + (hasCDE() && MS.isCDEInstr(Mnemonic) && + !MS.isITPredicableCDEInstr(Mnemonic)) || (hasMVE() && (Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") || Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") || @@ -6770,6 +6865,69 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic, ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc())); } +// Dual-register instructions have the following syntax: +// <mnemonic> <predicate>? <coproc>, <Rdest>, <Rdest+1>, <Rsrc>, ..., #imm +// This function tries to remove <Rdest+1> and replace <Rdest> with a pair +// operand. If the conversion fails, an error is diagnosed, and the function +// returns true. +bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic, + OperandVector &Operands) { + assert(MS.isCDEDualRegInstr(Mnemonic)); + bool isPredicable = + Mnemonic == "cx1da" || Mnemonic == "cx2da" || Mnemonic == "cx3da"; + size_t NumPredOps = isPredicable ?
1 : 0; + + if (Operands.size() <= 3 + NumPredOps) + return false; + + StringRef Op2Diag( + "operand must be an even-numbered register in the range [r0, r10]"); + + const MCParsedAsmOperand &Op2 = *Operands[2 + NumPredOps]; + if (!Op2.isReg()) + return Error(Op2.getStartLoc(), Op2Diag); + + unsigned RNext; + unsigned RPair; + switch (Op2.getReg()) { + default: + return Error(Op2.getStartLoc(), Op2Diag); + case ARM::R0: + RNext = ARM::R1; + RPair = ARM::R0_R1; + break; + case ARM::R2: + RNext = ARM::R3; + RPair = ARM::R2_R3; + break; + case ARM::R4: + RNext = ARM::R5; + RPair = ARM::R4_R5; + break; + case ARM::R6: + RNext = ARM::R7; + RPair = ARM::R6_R7; + break; + case ARM::R8: + RNext = ARM::R9; + RPair = ARM::R8_R9; + break; + case ARM::R10: + RNext = ARM::R11; + RPair = ARM::R10_R11; + break; + } + + const MCParsedAsmOperand &Op3 = *Operands[3 + NumPredOps]; + if (!Op3.isReg() || Op3.getReg() != RNext) + return Error(Op3.getStartLoc(), "operand must be a consecutive register"); + + Operands.erase(Operands.begin() + 3 + NumPredOps); + Operands[2 + NumPredOps] = + ARMOperand::CreateReg(RPair, Op2.getStartLoc(), Op2.getEndLoc()); + return false; +} + /// Parse an arm instruction mnemonic followed by its operands. bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { @@ -6786,7 +6944,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // First check for the ARM-specific .req directive. if (Parser.getTok().is(AsmToken::Identifier) && - Parser.getTok().getIdentifier() == ".req") { + Parser.getTok().getIdentifier().lower() == ".req") { parseDirectiveReq(Name, NameLoc); // We always return 'error' for this, as we're done with this // statement and don't need to match the 'instruction'. @@ -6823,6 +6981,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // ITx -> x100 (ITT -> 0100, ITE -> 1100) // ITxy -> xy10 (e.g. ITET -> 1010) // ITxyz -> xyz1 (e.g. ITEET -> 1101) + // Note: See the ARM::PredBlockMask enum in + // /lib/Target/ARM/Utils/ARMBaseInfo.h if (Mnemonic == "it" || Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst")) { SMLoc Loc = Mnemonic == "it" ? SMLoc::getFromPointer(NameLoc.getPointer() + 2) : @@ -6969,6 +7129,21 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands); + if (hasCDE() && MS.isCDEInstr(Mnemonic)) { + // Dual-register instructions use even-odd register pairs as their + // destination operand; in assembly such a pair is spelled as two + // consecutive registers, without any special syntax. + // CDEConvertDualRegOperand tries to convert such an operand into a + // register pair, e.g. r2, r3 -> r2_r3. It returns true if an error + // message has been emitted. If the function returns false, it either + // succeeded or an error (e.g. missing operand) will be diagnosed + // elsewhere. + if (MS.isCDEDualRegInstr(Mnemonic)) { + bool GotError = CDEConvertDualRegOperand(Mnemonic, Operands); + if (GotError) + return GotError; + } + } + // Some instructions, mostly Thumb, have forms for the same mnemonic that // do and don't have a cc_out optional-def operand.
With some spot-checks // of the operand list, we can figure out which variant we're trying to @@ -7947,6 +8122,142 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst, return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1"); break; } + case ARM::UMAAL: + case ARM::UMLAL: + case ARM::UMULL: + case ARM::t2UMAAL: + case ARM::t2UMLAL: + case ARM::t2UMULL: + case ARM::SMLAL: + case ARM::SMLALBB: + case ARM::SMLALBT: + case ARM::SMLALD: + case ARM::SMLALDX: + case ARM::SMLALTB: + case ARM::SMLALTT: + case ARM::SMLSLD: + case ARM::SMLSLDX: + case ARM::SMULL: + case ARM::t2SMLAL: + case ARM::t2SMLALBB: + case ARM::t2SMLALBT: + case ARM::t2SMLALD: + case ARM::t2SMLALDX: + case ARM::t2SMLALTB: + case ARM::t2SMLALTT: + case ARM::t2SMLSLD: + case ARM::t2SMLSLDX: + case ARM::t2SMULL: { + unsigned RdHi = Inst.getOperand(0).getReg(); + unsigned RdLo = Inst.getOperand(1).getReg(); + if(RdHi == RdLo) { + return Error(Loc, + "unpredictable instruction, RdHi and RdLo must be different"); + } + break; + } + + case ARM::CDE_CX1: + case ARM::CDE_CX1A: + case ARM::CDE_CX1D: + case ARM::CDE_CX1DA: + case ARM::CDE_CX2: + case ARM::CDE_CX2A: + case ARM::CDE_CX2D: + case ARM::CDE_CX2DA: + case ARM::CDE_CX3: + case ARM::CDE_CX3A: + case ARM::CDE_CX3D: + case ARM::CDE_CX3DA: + case ARM::CDE_VCX1_vec: + case ARM::CDE_VCX1_fpsp: + case ARM::CDE_VCX1_fpdp: + case ARM::CDE_VCX1A_vec: + case ARM::CDE_VCX1A_fpsp: + case ARM::CDE_VCX1A_fpdp: + case ARM::CDE_VCX2_vec: + case ARM::CDE_VCX2_fpsp: + case ARM::CDE_VCX2_fpdp: + case ARM::CDE_VCX2A_vec: + case ARM::CDE_VCX2A_fpsp: + case ARM::CDE_VCX2A_fpdp: + case ARM::CDE_VCX3_vec: + case ARM::CDE_VCX3_fpsp: + case ARM::CDE_VCX3_fpdp: + case ARM::CDE_VCX3A_vec: + case ARM::CDE_VCX3A_fpsp: + case ARM::CDE_VCX3A_fpdp: { + assert(Inst.getOperand(1).isImm() && + "CDE operand 1 must be a coprocessor ID"); + int64_t Coproc = Inst.getOperand(1).getImm(); + if (Coproc < 8 && !ARM::isCDECoproc(Coproc, *STI)) + return Error(Operands[1]->getStartLoc(), + "coprocessor must be configured as CDE"); + else if (Coproc >= 8) + return Error(Operands[1]->getStartLoc(), + "coprocessor must be in the range [p0, p7]"); + break; + } + + case ARM::t2CDP: + case ARM::t2CDP2: + case ARM::t2LDC2L_OFFSET: + case ARM::t2LDC2L_OPTION: + case ARM::t2LDC2L_POST: + case ARM::t2LDC2L_PRE: + case ARM::t2LDC2_OFFSET: + case ARM::t2LDC2_OPTION: + case ARM::t2LDC2_POST: + case ARM::t2LDC2_PRE: + case ARM::t2LDCL_OFFSET: + case ARM::t2LDCL_OPTION: + case ARM::t2LDCL_POST: + case ARM::t2LDCL_PRE: + case ARM::t2LDC_OFFSET: + case ARM::t2LDC_OPTION: + case ARM::t2LDC_POST: + case ARM::t2LDC_PRE: + case ARM::t2MCR: + case ARM::t2MCR2: + case ARM::t2MCRR: + case ARM::t2MCRR2: + case ARM::t2MRC: + case ARM::t2MRC2: + case ARM::t2MRRC: + case ARM::t2MRRC2: + case ARM::t2STC2L_OFFSET: + case ARM::t2STC2L_OPTION: + case ARM::t2STC2L_POST: + case ARM::t2STC2L_PRE: + case ARM::t2STC2_OFFSET: + case ARM::t2STC2_OPTION: + case ARM::t2STC2_POST: + case ARM::t2STC2_PRE: + case ARM::t2STCL_OFFSET: + case ARM::t2STCL_OPTION: + case ARM::t2STCL_POST: + case ARM::t2STCL_PRE: + case ARM::t2STC_OFFSET: + case ARM::t2STC_OPTION: + case ARM::t2STC_POST: + case ARM::t2STC_PRE: { + unsigned Opcode = Inst.getOpcode(); + // Inst.getOperand indexes operands in the (oops ...) and (iops ...) dags, + // CopInd is the index of the coprocessor operand. 
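The CDE coprocessor validation above issues two diagnostics: the coprocessor ID must be in [p0, p7], and that slot must actually be configured as CDE (the FeatureCoprocCDE subtarget bits). A sketch with a plain bitmask standing in for the subtarget query; the mask parameter is an assumption of this sketch:

    #include <cstdint>

    enum class CoprocDiag { OK, MustBeCDE, OutOfRange };

    // Mirror of the checks above; CDEMask models the per-coprocessor
    // FeatureCoprocCDE configuration (bit i set == p<i> is CDEv1).
    CoprocDiag checkCDECoproc(int64_t Coproc, uint8_t CDEMask) {
      if (Coproc >= 8)
        return CoprocDiag::OutOfRange; // "must be in the range [p0, p7]"
      if (!(CDEMask & (1u << unsigned(Coproc))))
        return CoprocDiag::MustBeCDE;  // "must be configured as CDE"
      return CoprocDiag::OK;
    }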
+ size_t CopInd = 0; + if (Opcode == ARM::t2MRRC || Opcode == ARM::t2MRRC2) + CopInd = 2; + else if (Opcode == ARM::t2MRC || Opcode == ARM::t2MRC2) + CopInd = 1; + assert(Inst.getOperand(CopInd).isImm() && + "Operand must be a coprocessor ID"); + int64_t Coproc = Inst.getOperand(CopInd).getImm(); + // Operands[2] is the coprocessor operand at syntactic level + if (ARM::isCDECoproc(Coproc, *STI)) + return Error(Operands[2]->getStartLoc(), + "coprocessor must be configured as GCP"); + break; + } } return false; @@ -8223,50 +8534,6 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, } switch (Inst.getOpcode()) { - case ARM::MVE_VORNIZ0v4i32: - case ARM::MVE_VORNIZ0v8i16: - case ARM::MVE_VORNIZ8v4i32: - case ARM::MVE_VORNIZ8v8i16: - case ARM::MVE_VORNIZ16v4i32: - case ARM::MVE_VORNIZ24v4i32: - case ARM::MVE_VANDIZ0v4i32: - case ARM::MVE_VANDIZ0v8i16: - case ARM::MVE_VANDIZ8v4i32: - case ARM::MVE_VANDIZ8v8i16: - case ARM::MVE_VANDIZ16v4i32: - case ARM::MVE_VANDIZ24v4i32: { - unsigned Opcode; - bool imm16 = false; - switch(Inst.getOpcode()) { - case ARM::MVE_VORNIZ0v4i32: Opcode = ARM::MVE_VORRIZ0v4i32; break; - case ARM::MVE_VORNIZ0v8i16: Opcode = ARM::MVE_VORRIZ0v8i16; imm16 = true; break; - case ARM::MVE_VORNIZ8v4i32: Opcode = ARM::MVE_VORRIZ8v4i32; break; - case ARM::MVE_VORNIZ8v8i16: Opcode = ARM::MVE_VORRIZ8v8i16; imm16 = true; break; - case ARM::MVE_VORNIZ16v4i32: Opcode = ARM::MVE_VORRIZ16v4i32; break; - case ARM::MVE_VORNIZ24v4i32: Opcode = ARM::MVE_VORRIZ24v4i32; break; - case ARM::MVE_VANDIZ0v4i32: Opcode = ARM::MVE_VBICIZ0v4i32; break; - case ARM::MVE_VANDIZ0v8i16: Opcode = ARM::MVE_VBICIZ0v8i16; imm16 = true; break; - case ARM::MVE_VANDIZ8v4i32: Opcode = ARM::MVE_VBICIZ8v4i32; break; - case ARM::MVE_VANDIZ8v8i16: Opcode = ARM::MVE_VBICIZ8v8i16; imm16 = true; break; - case ARM::MVE_VANDIZ16v4i32: Opcode = ARM::MVE_VBICIZ16v4i32; break; - case ARM::MVE_VANDIZ24v4i32: Opcode = ARM::MVE_VBICIZ24v4i32; break; - default: llvm_unreachable("unexpected opcode"); - } - - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(Inst.getOperand(0)); - TmpInst.addOperand(Inst.getOperand(1)); - - // invert immediate - unsigned imm = ~Inst.getOperand(2).getImm() & (imm16 ? 0xffff : 0xffffffff); - TmpInst.addOperand(MCOperand::createImm(imm)); - - TmpInst.addOperand(Inst.getOperand(3)); - TmpInst.addOperand(Inst.getOperand(4)); - Inst = TmpInst; - return true; - } // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction. case ARM::LDRT_POST: case ARM::LDRBT_POST: { @@ -8285,6 +8552,26 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, Inst = TmpInst; return true; } + // Alias for 'ldr{sb,h,sh}t Rt, [Rn] {, #imm}' for omitted immediate. + case ARM::LDRSBTii: + case ARM::LDRHTii: + case ARM::LDRSHTii: { + MCInst TmpInst; + + if (Inst.getOpcode() == ARM::LDRSBTii) + TmpInst.setOpcode(ARM::LDRSBTi); + else if (Inst.getOpcode() == ARM::LDRHTii) + TmpInst.setOpcode(ARM::LDRHTi); + else if (Inst.getOpcode() == ARM::LDRSHTii) + TmpInst.setOpcode(ARM::LDRSHTi); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::createImm(256)); + TmpInst.addOperand(Inst.getOperand(2)); + Inst = TmpInst; + return true; + } // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
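Each alias case in processInstruction follows the same mechanical pattern: build a fresh MCInst with the canonical opcode, re-add (and sometimes duplicate or synthesize) operands, then overwrite the matched instruction, as the LDRSBTii case above does by duplicating the base register and synthesizing a default offset immediate. The bare skeleton using LLVM's MCInst API; which operands get inserted or duplicated differs per alias:

    #include "llvm/MC/MCInst.h"

    // Skeleton shared by the alias expansions: new opcode, operands carried
    // over, original instruction replaced (a sketch, not a real expansion).
    llvm::MCInst expandAlias(const llvm::MCInst &Parsed, unsigned RealOpcode) {
      llvm::MCInst Tmp;
      Tmp.setOpcode(RealOpcode);
      for (unsigned I = 0, E = Parsed.getNumOperands(); I != E; ++I)
        Tmp.addOperand(Parsed.getOperand(I));
      return Tmp;
    }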
case ARM::STRT_POST: case ARM::STRBT_POST: { @@ -8323,7 +8610,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, // Reading PC provides the start of the current instruction + 8 and // the transform to adr is biased by that. MCSymbol *Dot = getContext().createTempSymbol(); - Out.EmitLabel(Dot); + Out.emitLabel(Dot); const MCExpr *OpExpr = Inst.getOperand(2).getExpr(); const MCExpr *InstPC = MCSymbolRefExpr::create(Dot, MCSymbolRefExpr::VK_None, @@ -10521,7 +10808,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (isITBlockFull() || isITBlockTerminator(Inst)) flushPendingInstructions(Out); } else { - Out.EmitInstruction(Inst, getSTI()); + Out.emitInstruction(Inst, getSTI()); } return false; case Match_NearMisses: @@ -10546,7 +10833,7 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) { bool IsMachO = Format == MCObjectFileInfo::IsMachO; bool IsCOFF = Format == MCObjectFileInfo::IsCOFF; - StringRef IDVal = DirectiveID.getIdentifier(); + std::string IDVal = DirectiveID.getIdentifier().lower(); if (IDVal == ".word") parseLiteralValues(4, DirectiveID.getLoc()); else if (IDVal == ".short" || IDVal == ".hword") @@ -10632,7 +10919,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { const MCExpr *Value; if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size, L); + getParser().getStreamer().emitValue(Value, Size, L); return false; }; return (parseMany(parseOne)); @@ -10648,7 +10935,7 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) { if (!isThumb()) SwitchMode(); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); return false; } @@ -10661,7 +10948,7 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) { if (isThumb()) SwitchMode(); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code32); return false; } @@ -10673,7 +10960,7 @@ void ARMAsmParser::doBeforeLabelEmit(MCSymbol *Symbol) { void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) { if (NextSymbolIsThumb) { - getParser().getStreamer().EmitThumbFunc(Symbol); + getParser().getStreamer().emitThumbFunc(Symbol); NextSymbolIsThumb = false; } } @@ -10693,7 +10980,7 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) { Parser.getTok().is(AsmToken::String)) { MCSymbol *Func = getParser().getContext().getOrCreateSymbol( Parser.getTok().getIdentifier()); - getParser().getStreamer().EmitThumbFunc(Func); + getParser().getStreamer().emitThumbFunc(Func); Parser.Lex(); if (parseToken(AsmToken::EndOfStatement, "unexpected token in '.thumb_func' directive")) @@ -10757,14 +11044,14 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) { if (!isThumb()) SwitchMode(); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } else { if (!hasARM()) return Error(L, "target does not support ARM mode"); if (isThumb()) SwitchMode(); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code32); } return false; @@ -10817,7 +11104,7 @@ void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) { SwitchMode(); } else { // Mode switch forced, because the new arch doesn't support the old mode. - getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16 + getParser().getStreamer().emitAssemblerFlag(isThumb() ? MCAF_Code16 : MCAF_Code32); // Warn about the implicit mode switch.
GAS does not switch modes here, // but instead stays in the old mode, reporting an error on any following @@ -10859,11 +11146,13 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) { TagLoc = Parser.getTok().getLoc(); if (Parser.getTok().is(AsmToken::Identifier)) { StringRef Name = Parser.getTok().getIdentifier(); - Tag = ARMBuildAttrs::AttrTypeFromString(Name); - if (Tag == -1) { + Optional<unsigned> Ret = + ELFAttrs::attrTypeFromString(Name, ARMBuildAttrs::ARMAttributeTags); + if (!Ret.hasValue()) { Error(TagLoc, "attribute name not recognised: " + Name); return false; } + Tag = Ret.getValue(); Parser.Lex(); } else { const MCExpr *AttrExpr; @@ -11314,9 +11603,9 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) { assert(Section && "must have section to emit alignment"); if (Section->UseCodeAlign()) - getStreamer().EmitCodeAlignment(2); + getStreamer().emitCodeAlignment(2); else - getStreamer().EmitValueToAlignment(2); + getStreamer().emitValueToAlignment(2); return false; } @@ -11516,9 +11805,9 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) { const MCSection *Section = getStreamer().getCurrentSectionOnly(); assert(Section && "must have section to emit alignment"); if (Section->UseCodeAlign()) - getStreamer().EmitCodeAlignment(4, 0); + getStreamer().emitCodeAlignment(4, 0); else - getStreamer().EmitValueToAlignment(4, 0, 1, 0); + getStreamer().emitValueToAlignment(4, 0, 1, 0); return false; } return true; @@ -11770,7 +12059,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { // when we start to table-generate them, and we can use the ARM // flags below, that were generated by table-gen. static const struct { - const unsigned Kind; + const uint64_t Kind; const FeatureBitset ArchCheck; const FeatureBitset Features; } Extensions[] = { @@ -11819,7 +12108,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { EnableFeature = false; Name = Name.substr(2); } - unsigned FeatureKind = ARM::parseArchExt(Name); + uint64_t FeatureKind = ARM::parseArchExt(Name); if (FeatureKind == ARM::AEK_INVALID) return Error(ExtLoc, "unknown architectural extension: " + Name); @@ -11969,6 +12258,7 @@ bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic, Mnemonic.startswith("vpnot") || Mnemonic.startswith("vbic") || Mnemonic.startswith("vrmlsldavh") || Mnemonic.startswith("vmlsldav") || Mnemonic.startswith("vcvt") || + MS.isVPTPredicableCDEInstr(Mnemonic) || (Mnemonic.startswith("vmov") && !(ExtraToken == ".f16" || ExtraToken == ".32" || ExtraToken == ".16" || ExtraToken == ".8")); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index d26b04556abb..54ff0d9966cb 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus +DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -201,6 +204,8 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static 
DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); @@ -538,10 +543,6 @@ template<unsigned MinLog, unsigned MaxLog> static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); -template <int shift> -static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder); template<unsigned start> static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address, @@ -1087,8 +1088,12 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, } } + uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4); + const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI) + ? DecoderTableThumb2CDE32 + : DecoderTableThumb2CoProc32; Result = - decodeInstruction(DecoderTableThumb2CoProc32, MI, Insn32, Address, this, STI); + decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; Check(Result, AddThumbPredicate(MI)); @@ -1220,10 +1225,12 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; + // According to the Arm ARM RegNo = 14 is undefined, but we return fail + // rather than SoftFail as there is no GPRPair table entry for index 7. if (RegNo > 13) return MCDisassembler::Fail; - if ((RegNo & 1) || RegNo == 0xe) + if (RegNo & 1) S = MCDisassembler::SoftFail; unsigned RegisterPair = GPRPairDecoderTable[RegNo/2]; @@ -1231,6 +1238,19 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, return S; } +static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo > 13) + return MCDisassembler::Fail; + + unsigned RegisterPair = GPRPairDecoderTable[RegNo/2]; + Inst.addOperand(MCOperand::createReg(RegisterPair)); + + if ((RegNo & 1) || RegNo > 10) + return MCDisassembler::SoftFail; + return MCDisassembler::Success; +} + static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { @@ -6068,6 +6088,23 @@ static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus +DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + if (RegNo == 15) { + Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV)); + return MCDisassembler::Success; + } + + unsigned Register = GPRDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + + if (RegNo == 13) + return MCDisassembler::SoftFail; + + return MCDisassembler::Success; +} + static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -6395,16 +6432,6 @@ static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val, return S; } -template <int shift> -static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val, - uint64_t Address, - const void *Decoder) { - Val <<= shift; - - Inst.addOperand(MCOperand::createImm(Val)); - return MCDisassembler::Success; -} - template<unsigned start> 
static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val, uint64_t Address, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index be02da18fb7d..9ad595c016c4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -48,10 +48,17 @@ public: } // end anonymous namespace Optional<MCFixupKind> ARMAsmBackend::getFixupKind(StringRef Name) const { - if (STI.getTargetTriple().isOSBinFormatELF() && Name == "R_ARM_NONE") - return FK_NONE; - - return MCAsmBackend::getFixupKind(Name); + if (!STI.getTargetTriple().isOSBinFormatELF()) + return None; + + unsigned Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/ARM.def" +#undef ELF_RELOC + .Default(-1u); + if (Type == -1u) + return None; + return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type); } const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { @@ -166,6 +173,11 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel} }; + // Fixup kinds from .reloc directive are like R_ARM_NONE. They do not require + // any extra processing. + if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -310,9 +322,8 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, return reasonForFixupRelaxation(Fixup, Value); } -void ARMAsmBackend::relaxInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, - MCInst &Res) const { +void ARMAsmBackend::relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const { unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode(), STI); // Sanity check w/ diagnostic if we get here w/ a bogus instruction. @@ -328,17 +339,18 @@ void ARMAsmBackend::relaxInstruction(const MCInst &Inst, // have to change the operands too. if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) && RelaxedOp == ARM::tHINT) { + MCInst Res; Res.setOpcode(RelaxedOp); Res.addOperand(MCOperand::createImm(0)); Res.addOperand(MCOperand::createImm(14)); Res.addOperand(MCOperand::createReg(0)); + Inst = std::move(Res); return; } // The rest of instructions we're relaxing have the same operands. // We just need to update to the proper opcode. - Res = Inst; - Res.setOpcode(RelaxedOp); + Inst.setOpcode(RelaxedOp); } bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { @@ -432,7 +444,6 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, default: Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type"); return 0; - case FK_NONE: case FK_Data_1: case FK_Data_2: case FK_Data_4: @@ -865,7 +876,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCSymbolRefExpr *A = Target.getSymA(); const MCSymbol *Sym = A ? 
&A->getSymbol() : nullptr; const unsigned FixupKind = Fixup.getKind(); - if (FixupKind == FK_NONE) + if (FixupKind >= FirstLiteralRelocationKind) return true; if (FixupKind == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); @@ -909,9 +920,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); - case FK_NONE: - return 0; - case FK_Data_1: case ARM::fixup_arm_thumb_bcc: case ARM::fixup_arm_thumb_cp: @@ -973,9 +981,6 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { default: llvm_unreachable("Unknown fixup kind!"); - case FK_NONE: - return 0; - case FK_Data_1: return 1; case FK_Data_2: @@ -1031,7 +1036,10 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo* STI) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + unsigned Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return; + unsigned NumBytes = getFixupKindNumBytes(Kind); MCContext &Ctx = Asm.getContext(); Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI); if (!Value) @@ -1043,7 +1051,7 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, // Used to point to big endian bytes. unsigned FullSizeBytes; if (Endian == support::big) { - FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind()); + FullSizeBytes = getFixupKindContainerSizeBytes(Kind); assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!"); assert(NumBytes <= FullSizeBytes && "Invalid fixup size!"); } @@ -1110,11 +1118,11 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( const MCCFIInstruction &Inst = Instrs[i]; switch (Inst.getOperation()) { case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa - CFARegisterOffset = -Inst.getOffset(); + CFARegisterOffset = Inst.getOffset(); CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); break; case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset - CFARegisterOffset = -Inst.getOffset(); + CFARegisterOffset = Inst.getOffset(); break; case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true); @@ -1271,35 +1279,6 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( return CompactUnwindEncoding | ((FloatRegCount - 1) << 8); } -static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) { - ARM::ArchKind AK = ARM::parseArch(Arch); - switch (AK) { - default: - return MachO::CPU_SUBTYPE_ARM_V7; - case ARM::ArchKind::ARMV4T: - return MachO::CPU_SUBTYPE_ARM_V4T; - case ARM::ArchKind::ARMV5T: - case ARM::ArchKind::ARMV5TE: - case ARM::ArchKind::ARMV5TEJ: - return MachO::CPU_SUBTYPE_ARM_V5; - case ARM::ArchKind::ARMV6: - case ARM::ArchKind::ARMV6K: - return MachO::CPU_SUBTYPE_ARM_V6; - case ARM::ArchKind::ARMV7A: - return MachO::CPU_SUBTYPE_ARM_V7; - case ARM::ArchKind::ARMV7S: - return MachO::CPU_SUBTYPE_ARM_V7S; - case ARM::ArchKind::ARMV7K: - return MachO::CPU_SUBTYPE_ARM_V7K; - case ARM::ArchKind::ARMV6M: - return MachO::CPU_SUBTYPE_ARM_V6M; - case ARM::ArchKind::ARMV7M: - return MachO::CPU_SUBTYPE_ARM_V7M; - case ARM::ArchKind::ARMV7EM: - return MachO::CPU_SUBTYPE_ARM_V7EM; - } -} - static MCAsmBackend *createARMAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, @@ -1309,10 +1288,8 @@ static MCAsmBackend *createARMAsmBackend(const Target &T, switch (TheTriple.getObjectFormat()) { default: 
llvm_unreachable("unsupported object format"); - case Triple::MachO: { - MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName()); - return new ARMAsmBackendDarwin(T, STI, MRI, CS); - } + case Triple::MachO: + return new ARMAsmBackendDarwin(T, STI, MRI); case Triple::COFF: assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); return new ARMAsmBackendWinCOFF(T, STI); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 67722a5e5b64..38c7b30769b3 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -66,8 +66,8 @@ public: const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override; bool writeNopData(raw_ostream &OS, uint64_t Count) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h index 87e56940f46d..e27bb134670f 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -16,16 +16,20 @@ namespace llvm { class ARMAsmBackendDarwin : public ARMAsmBackend { const MCRegisterInfo &MRI; + Triple TT; public: const MachO::CPUSubTypeARM Subtype; ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI, - const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st) - : ARMAsmBackend(T, STI, support::little), MRI(MRI), Subtype(st) {} + const MCRegisterInfo &MRI) + : ARMAsmBackend(T, STI, support::little), MRI(MRI), + TT(STI.getTargetTriple()), + Subtype((MachO::CPUSubTypeARM)cantFail( + MachO::getCPUSubType(STI.getTargetTriple()))) {} std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override { - return createARMMachObjectWriter(/*Is64Bit=*/false, MachO::CPU_TYPE_ARM, - Subtype); + return createARMMachObjectWriter( + /*Is64Bit=*/false, cantFail(MachO::getCPUType(TT)), Subtype); } uint32_t generateCompactUnwindEncoding( diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 6293a2462306..74cd2e681ded 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -393,9 +393,21 @@ namespace ARMII { // in an IT block). ThumbArithFlagSetting = 1 << 19, - // Whether an instruction can be included in an MVE tail-predicated loop. + // Whether an instruction can be included in an MVE tail-predicated loop, + // though extra validity checks may need to be performed too. ValidForTailPredication = 1 << 20, + // Whether an instruction writes to the top/bottom half of a vector element + // and leaves the other half untouched. + RetainsPreviousHalfElement = 1 << 21, + + // Whether the instruction produces a scalar result from vector operands. + HorizontalReduction = 1 << 22, + + // Whether this instruction produces a vector result that is larger than + // its input, typically reading from the top/bottom halves of the input(s). 
+ DoubleWidthResult = 1 << 23, + //===------------------------------------------------------------------===// // Code domain. DomainShift = 15, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 2c26dd388c05..37d81e4b0af1 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -53,8 +53,8 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI) bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym, unsigned Type) const { - // FIXME: This is extremely conservative. This really needs to use a - // whitelist with a clear explanation for why each relocation needs to + // FIXME: This is extremely conservative. This really needs to use an + // explicit list with a clear explanation for why each relocation needs to // point to the symbol, not to the section. switch (Type) { default: @@ -79,6 +79,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, MCContext &Ctx) const { + unsigned Kind = Fixup.getTargetKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); if (IsPCRel) { @@ -89,9 +92,18 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case FK_Data_4: switch (Modifier) { default: - llvm_unreachable("Unsupported Modifier"); - case MCSymbolRefExpr::VK_None: + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 4-byte pc-relative data relocation"); + return ELF::R_ARM_NONE; + case MCSymbolRefExpr::VK_None: { + if (const MCSymbolRefExpr *SymRef = Target.getSymA()) { + // For GNU AS compatibility expressions such as + // _GLOBAL_OFFSET_TABLE_ - label, emit a R_ARM_BASE_PREL relocation.
+ if (SymRef->getSymbol().getName() == "_GLOBAL_OFFSET_TABLE_") + return ELF::R_ARM_BASE_PREL; + } return ELF::R_ARM_REL32; + } case MCSymbolRefExpr::VK_GOTTPOFF: return ELF::R_ARM_TLS_IE32; case MCSymbolRefExpr::VK_ARM_GOT_PREL: @@ -145,30 +157,34 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, return ELF::R_ARM_THM_BF18; } } - switch (Fixup.getTargetKind()) { + switch (Kind) { default: Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol"); return ELF::R_ARM_NONE; - case FK_NONE: - return ELF::R_ARM_NONE; case FK_Data_1: switch (Modifier) { default: - llvm_unreachable("unsupported Modifier"); + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 1-byte data relocation"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_None: return ELF::R_ARM_ABS8; } case FK_Data_2: switch (Modifier) { default: - llvm_unreachable("unsupported modifier"); + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 2-byte data relocation"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_None: return ELF::R_ARM_ABS16; } case FK_Data_4: switch (Modifier) { default: - llvm_unreachable("Unsupported Modifier"); + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 4-byte data relocation"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_ARM_NONE: return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_GOT: @@ -210,7 +226,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_arm_movt_hi16: switch (Modifier) { default: - llvm_unreachable("Unsupported Modifier"); + Ctx.reportError(Fixup.getLoc(), "invalid fixup for ARM MOVT instruction"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_None: return ELF::R_ARM_MOVT_ABS; case MCSymbolRefExpr::VK_ARM_SBREL: @@ -219,7 +236,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_arm_movw_lo16: switch (Modifier) { default: - llvm_unreachable("Unsupported Modifier"); + Ctx.reportError(Fixup.getLoc(), "invalid fixup for ARM MOVW instruction"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_None: return ELF::R_ARM_MOVW_ABS_NC; case MCSymbolRefExpr::VK_ARM_SBREL: @@ -228,7 +246,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_t2_movt_hi16: switch (Modifier) { default: - llvm_unreachable("Unsupported Modifier"); + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for Thumb MOVT instruction"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_None: return ELF::R_ARM_THM_MOVT_ABS; case MCSymbolRefExpr::VK_ARM_SBREL: @@ -237,7 +257,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARM::fixup_t2_movw_lo16: switch (Modifier) { default: - llvm_unreachable("Unsupported Modifier"); + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for Thumb MOVW instruction"); + return ELF::R_ARM_NONE; case MCSymbolRefExpr::VK_None: return ELF::R_ARM_THM_MOVW_ABS_NC; case MCSymbolRefExpr::VK_ARM_SBREL: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index f558ca8d2d9f..876741d6c343 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -93,7 +93,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, StringRef StringValue) override; void emitArch(ARM::ArchKind Arch) override; - void emitArchExtension(unsigned ArchExt) 
override; + void emitArchExtension(uint64_t ArchExt) override; void emitObjectArch(ARM::ArchKind Arch) override; void emitFPU(unsigned FPU) override; void emitInst(uint32_t Inst, char Suffix = '\0') override; @@ -177,7 +177,8 @@ void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {} void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) { OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value); if (IsVerboseAsm) { - StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute); + StringRef Name = + ELFAttrs::attrTypeAsString(Attribute, ARMBuildAttrs::ARMAttributeTags); if (!Name.empty()) OS << "\t@ " << Name; } @@ -193,7 +194,8 @@ void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute, default: OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\""; if (IsVerboseAsm) { - StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute); + StringRef Name = ELFAttrs::attrTypeAsString( + Attribute, ARMBuildAttrs::ARMAttributeTags); if (!Name.empty()) OS << "\t@ " << Name; } @@ -212,7 +214,9 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute, if (!StringValue.empty()) OS << ", \"" << StringValue << "\""; if (IsVerboseAsm) - OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute); + OS << "\t@ " + << ELFAttrs::attrTypeAsString(Attribute, + ARMBuildAttrs::ARMAttributeTags); break; } OS << "\n"; @@ -222,7 +226,7 @@ void ARMTargetAsmStreamer::emitArch(ARM::ArchKind Arch) { OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n"; } -void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) { +void ARMTargetAsmStreamer::emitArchExtension(uint64_t ArchExt) { OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n"; } @@ -238,7 +242,7 @@ void ARMTargetAsmStreamer::finishAttributeSection() {} void ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) { - OS << "\t.tlsdescseq\t" << S->getSymbol().getName(); + OS << "\t.tlsdescseq\t" << S->getSymbol().getName() << "\n"; } void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { @@ -328,12 +332,8 @@ private: } // Create new attribute item - AttributeItem Item = { - AttributeItem::NumericAttribute, - Attribute, - Value, - StringRef("") - }; + AttributeItem Item = {AttributeItem::NumericAttribute, Attribute, Value, + std::string(StringRef(""))}; Contents.push_back(Item); } @@ -344,17 +344,13 @@ private: if (!OverwriteExisting) return; Item->Type = AttributeItem::TextAttribute; - Item->StringValue = Value; + Item->StringValue = std::string(Value); return; } // Create new attribute item - AttributeItem Item = { - AttributeItem::TextAttribute, - Attribute, - 0, - Value - }; + AttributeItem Item = {AttributeItem::TextAttribute, Attribute, 0, + std::string(Value)}; Contents.push_back(Item); } @@ -366,17 +362,13 @@ private: return; Item->Type = AttributeItem::NumericAndTextAttributes; Item->IntValue = IntValue; - Item->StringValue = StringValue; + Item->StringValue = std::string(StringValue); return; } // Create new attribute item - AttributeItem Item = { - AttributeItem::NumericAndTextAttributes, - Attribute, - IntValue, - StringValue - }; + AttributeItem Item = {AttributeItem::NumericAndTextAttributes, Attribute, + IntValue, std::string(StringValue)}; Contents.push_back(Item); } @@ -452,7 +444,7 @@ public: ~ARMELFStreamer() override = default; - void FinishImpl() override; + void finishImpl() override; // ARM exception handling directives void emitFnStart(); @@ -468,13 +460,13 @@ public: void emitUnwindRaw(int64_t Offset, const 
SmallVectorImpl<uint8_t> &Opcodes); void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc) override { - EmitDataMappingSymbol(); + emitDataMappingSymbol(); MCObjectStreamer::emitFill(NumBytes, FillValue, Loc); } - void ChangeSection(MCSection *Section, const MCExpr *Subsection) override { + void changeSection(MCSection *Section, const MCExpr *Subsection) override { LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo); - MCELFStreamer::ChangeSection(Section, Subsection); + MCELFStreamer::changeSection(Section, Subsection); auto LastMappingSymbol = LastMappingSymbols.find(Section); if (LastMappingSymbol != LastMappingSymbols.end()) { LastEMSInfo = std::move(LastMappingSymbol->second); @@ -486,14 +478,14 @@ public: /// This function is the one used to emit instruction data into the ELF /// streamer. We override it to add the appropriate mapping symbol if /// necessary. - void EmitInstruction(const MCInst &Inst, + void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override { if (IsThumb) EmitThumbMappingSymbol(); else EmitARMMappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); + MCELFStreamer::emitInstruction(Inst, STI); } void emitInst(uint32_t Inst, char Suffix) { @@ -533,15 +525,15 @@ public: llvm_unreachable("Invalid Suffix"); } - MCELFStreamer::EmitBytes(StringRef(Buffer, Size)); + MCELFStreamer::emitBytes(StringRef(Buffer, Size)); } /// This is one of the functions used to emit data into an ELF section, so the /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if /// necessary. - void EmitBytes(StringRef Data) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); + void emitBytes(StringRef Data) override { + emitDataMappingSymbol(); + MCELFStreamer::emitBytes(Data); } void FlushPendingMappingSymbol() { @@ -555,7 +547,7 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if /// necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { + void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) { if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) { getContext().reportError(Loc, "relocated expression must be 32-bit"); @@ -564,12 +556,12 @@ public: getOrCreateDataFragment(); } - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size, Loc); + emitDataMappingSymbol(); + MCELFStreamer::emitValueImpl(Value, Size, Loc); } - void EmitAssemblerFlag(MCAssemblerFlag Flag) override { - MCELFStreamer::EmitAssemblerFlag(Flag); + void emitAssemblerFlag(MCAssemblerFlag Flag) override { + MCELFStreamer::emitAssemblerFlag(Flag); switch (Flag) { case MCAF_SyntaxUnified: @@ -609,7 +601,7 @@ private: ElfMappingSymbol State; }; - void EmitDataMappingSymbol() { + void emitDataMappingSymbol() { if (LastEMSInfo->State == EMS_Data) return; else if (LastEMSInfo->State == EMS_None) { @@ -648,7 +640,7 @@ private: void EmitMappingSymbol(StringRef Name) { auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( Name + "." + Twine(MappingSymbolCounter++))); - EmitLabel(Symbol); + emitLabel(Symbol); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); @@ -659,15 +651,15 @@ private: uint64_t Offset) { auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol( Name + "." 
+ Twine(MappingSymbolCounter++))); - EmitLabelAtPos(Symbol, Loc, F, Offset); + emitLabelAtPos(Symbol, Loc, F, Offset); Symbol->setType(ELF::STT_NOTYPE); Symbol->setBinding(ELF::STB_LOCAL); Symbol->setExternal(false); } - void EmitThumbFunc(MCSymbol *Func) override { + void emitThumbFunc(MCSymbol *Func) override { getAssembler().setIsThumbFunc(Func); - EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction); + emitSymbolAttribute(Func, MCSA_ELF_TypeFunction); } // Helper functions for ARM exception handling directives @@ -868,6 +860,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::ArchKind::ARMV8_3A: case ARM::ArchKind::ARMV8_4A: case ARM::ArchKind::ARMV8_5A: + case ARM::ArchKind::ARMV8_6A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -1091,7 +1084,7 @@ void ARMTargetELFStreamer::finishAttributeSection() { Streamer.SwitchSection(AttributeSection); // Format version - Streamer.EmitIntValue(0x41, 1); + Streamer.emitInt8(0x41); } // Vendor size + Vendor name + '\0' @@ -1102,31 +1095,31 @@ void ARMTargetELFStreamer::finishAttributeSection() { const size_t ContentsSize = calculateContentSize(); - Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4); - Streamer.EmitBytes(CurrentVendor); - Streamer.EmitIntValue(0, 1); // '\0' + Streamer.emitInt32(VendorHeaderSize + TagHeaderSize + ContentsSize); + Streamer.emitBytes(CurrentVendor); + Streamer.emitInt8(0); // '\0' - Streamer.EmitIntValue(ARMBuildAttrs::File, 1); - Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4); + Streamer.emitInt8(ARMBuildAttrs::File); + Streamer.emitInt32(TagHeaderSize + ContentsSize); // Size should have been accounted for already, now // emit each field as its type (ULEB or String) for (size_t i = 0; i < Contents.size(); ++i) { AttributeItem item = Contents[i]; - Streamer.EmitULEB128IntValue(item.Tag); + Streamer.emitULEB128IntValue(item.Tag); switch (item.Type) { default: llvm_unreachable("Invalid attribute type"); case AttributeItem::NumericAttribute: - Streamer.EmitULEB128IntValue(item.IntValue); + Streamer.emitULEB128IntValue(item.IntValue); break; case AttributeItem::TextAttribute: - Streamer.EmitBytes(item.StringValue); - Streamer.EmitIntValue(0, 1); // '\0' + Streamer.emitBytes(item.StringValue); + Streamer.emitInt8(0); // '\0' break; case AttributeItem::NumericAndTextAttributes: - Streamer.EmitULEB128IntValue(item.IntValue); - Streamer.EmitBytes(item.StringValue); - Streamer.EmitIntValue(0, 1); // '\0' + Streamer.emitULEB128IntValue(item.IntValue); + Streamer.emitBytes(item.StringValue); + Streamer.emitInt8(0); // '\0' break; } } @@ -1143,7 +1136,7 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) { Streamer.getAssembler().registerSymbol(*Symbol); unsigned Type = cast<MCSymbolELF>(Symbol)->getType(); if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC) - Streamer.EmitThumbFunc(Symbol); + Streamer.emitThumbFunc(Symbol); } void @@ -1155,13 +1148,13 @@ void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) { if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) { const MCSymbol &Sym = SRE->getSymbol(); if (!Sym.isDefined()) { - getStreamer().EmitAssignment(Symbol, Value); + getStreamer().emitAssignment(Symbol, Value); return; } } - getStreamer().EmitThumbFunc(Symbol); - getStreamer().EmitAssignment(Symbol, Value); + getStreamer().emitThumbFunc(Symbol); + getStreamer().emitAssignment(Symbol, Value); } void 
ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { @@ -1170,12 +1163,12 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) { void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; } -void ARMELFStreamer::FinishImpl() { +void ARMELFStreamer::finishImpl() { MCTargetStreamer &TS = *getTargetStreamer(); ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); ATS.finishAttributeSection(); - MCELFStreamer::FinishImpl(); + MCELFStreamer::finishImpl(); } void ARMELFStreamer::reset() { @@ -1201,7 +1194,7 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix, static_cast<const MCSectionELF &>(Fn.getSection()); // Create the name for new section - StringRef FnSecName(FnSection.getSectionName()); + StringRef FnSecName(FnSection.getName()); SmallString<128> EHSecName(Prefix); if (FnSecName != ".text") { EHSecName += FnSecName; @@ -1213,13 +1206,13 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix, Flags |= ELF::SHF_GROUP; MCSectionELF *EHSection = getContext().getELFSection( EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(), - static_cast<const MCSymbolELF *>(&Fn)); + static_cast<const MCSymbolELF *>(FnSection.getBeginSymbol())); assert(EHSection && "Failed to get the required EH section"); // Switch to .ARM.extab or .ARM.exidx section SwitchSection(EHSection); - EmitCodeAlignment(4); + emitCodeAlignment(4); } inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) { @@ -1258,7 +1251,7 @@ void ARMELFStreamer::EHReset() { void ARMELFStreamer::emitFnStart() { assert(FnStart == nullptr); FnStart = getContext().createTempSymbol(); - EmitLabel(FnStart); + emitLabel(FnStart); } void ARMELFStreamer::emitFnEnd() { @@ -1284,17 +1277,17 @@ void ARMELFStreamer::emitFnEnd() { MCSymbolRefExpr::VK_ARM_PREL31, getContext()); - EmitValue(FnStartRef, 4); + emitValue(FnStartRef, 4); if (CantUnwind) { - EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4); + emitInt32(ARM::EHABI::EXIDX_CANTUNWIND); } else if (ExTab) { // Emit a reference to the unwind opcodes in the ".ARM.extab" section. const MCSymbolRefExpr *ExTabEntryRef = MCSymbolRefExpr::create(ExTab, MCSymbolRefExpr::VK_ARM_PREL31, getContext()); - EmitValue(ExTabEntryRef, 4); + emitValue(ExTabEntryRef, 4); } else { // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in // the second word of exception index table entry. 
The size of the unwind @@ -1307,7 +1300,7 @@ void ARMELFStreamer::emitFnEnd() { Opcodes[1] << 8 | Opcodes[2] << 16 | Opcodes[3] << 24; - EmitIntValue(Intval, Opcodes.size()); + emitIntValue(Intval, Opcodes.size()); } // Switch to the section containing FnStart @@ -1366,7 +1359,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) { // Create .ARM.extab label for offset in .ARM.exidx assert(!ExTab); ExTab = getContext().createTempSymbol(); - EmitLabel(ExTab); + emitLabel(ExTab); // Emit personality if (Personality) { @@ -1375,7 +1368,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) { MCSymbolRefExpr::VK_ARM_PREL31, getContext()); - EmitValue(PersonalityRef, 4); + emitValue(PersonalityRef, 4); } // Emit unwind opcodes @@ -1386,7 +1379,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) { Opcodes[I + 1] << 8 | Opcodes[I + 2] << 16 | Opcodes[I + 3] << 24; - EmitIntValue(Intval, 4); + emitInt32(Intval); } // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or @@ -1397,7 +1390,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) { // In case that the .handlerdata directive is not specified by the // programmer, we should emit zero to terminate the handler data. if (NoHandlerData && !Personality) - EmitIntValue(0, 4); + emitInt32(0); } void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index b36106a78b71..744d919f2fd4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -288,7 +288,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address, case ARM::t2DSB: switch (MI->getOperand(0).getImm()) { default: - if (!printAliasInstr(MI, STI, O)) + if (!printAliasInstr(MI, Address, STI, O)) printInstruction(MI, Address, STI, O); break; case 0: @@ -302,7 +302,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address, return; } - if (!printAliasInstr(MI, STI, O)) + if (!printAliasInstr(MI, Address, STI, O)) printInstruction(MI, Address, STI, O); printAnnotation(O, Annot); @@ -1669,15 +1669,6 @@ void ARMInstPrinter::printVPTMask(const MCInst *MI, unsigned OpNum, } } -void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint32_t Val = MI->getOperand(OpNum).getImm(); - O << markup("<imm:") << "#0x"; - O.write_hex(Val); - O << markup(">"); -} - void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h index 20f901033395..37cb731ff001 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h @@ -32,10 +32,10 @@ public: // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, uint64_t Address, const MCSubtargetInfo &STI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI, - raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, + virtual bool printAliasInstr(const MCInst *MI, uint64_t Address, + const MCSubtargetInfo &STI, raw_ostream &O); + virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, const MCSubtargetInfo &STI, raw_ostream &O); static const char *getRegisterName(unsigned RegNo, @@ -43,6 +43,10 @@ public: void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O) { + printOperand(MI, OpNum, STI, O); + } void printSORegRegOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); @@ -109,6 +113,12 @@ public: template <unsigned scale> void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + template <unsigned scale> + void printAdrLabelOperand(const MCInst *MI, uint64_t /*Address*/, + unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + printAdrLabelOperand<scale>(MI, OpNum, STI, O); + } void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printThumbSRImm(const MCInst *MI, unsigned OpNum, @@ -206,6 +216,11 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printThumbLdrLabelOperand(const MCInst *MI, uint64_t /*Address*/, + unsigned OpNum, const MCSubtargetInfo &STI, + raw_ostream &O) { + printThumbLdrLabelOperand(MI, OpNum, STI, O); + } void printFBits16(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); void printFBits32(const MCInst *MI, unsigned OpNum, @@ -260,8 +275,6 @@ public: const MCSubtargetInfo &STI, raw_ostream &O); void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); - void printExpandedImmOperand(const MCInst *MI, unsigned OpNum, - const MCSubtargetInfo &STI, raw_ostream &O); void printMveSaturateOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); private: diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index d30d15df3d00..765613cf347d 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -37,8 +37,6 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) { ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI()) ? 
ExceptionHandling::SjLj : ExceptionHandling::DwarfCFI; - - UseIntegratedAssembler = true; } void ARMELFMCAsmInfo::anchor() { } @@ -73,8 +71,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) { // foo(plt) instead of foo@plt UseParensForSymbolVariant = true; - - UseIntegratedAssembler = true; } void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) { @@ -116,7 +112,6 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { ExceptionsType = ExceptionHandling::DwarfCFI; UseParensForSymbolVariant = true; - UseIntegratedAssembler = true; DwarfRegNumForCFI = false; // Conditional Thumb 4-byte instructions can have an implicit IT. diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 268fe7efd9ce..1cb99534f146 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -413,14 +413,6 @@ public: unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - template <uint8_t shift, bool invert> - unsigned getExpandedImmOpValue(const MCInst &MI, unsigned Op, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - static_assert(shift <= 32, "Shift count must be less than or equal to 32."); - const MCOperand MO = MI.getOperand(Op); - return (invert ? (MO.getImm() ^ 0xff) : MO.getImm()) >> shift; - } unsigned NEONThumb2DataIPostEncoder(const MCInst &MI, unsigned EncodedValue, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 9f60e70e0e02..05d73ccf6ff2 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -63,6 +63,25 @@ static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, return true; } } + if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] && + ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) || + (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) { + Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating " + "point instructions"; + return true; + } + return false; +} + +static bool getMRCDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI, + std::string &Info) { + if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] && + ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) || + (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) { + Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating " + "point instructions"; + return true; + } return false; } @@ -168,7 +187,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT, if (!ArchFS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); else - ArchFS = FS; + ArchFS = std::string(FS); } return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS); @@ -200,7 +219,7 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, MAI = new ARMELFMCAsmInfo(TheTriple); unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true); - MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0)); + MAI->addInitialFrameState(MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0)); return MAI; } @@ -266,7 +285,9 @@ public: bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const 
override { // We only handle PCRel branches for now. - if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL) + if (Inst.getNumOperands() == 0 || + Info->get(Inst.getOpcode()).OpInfo[0].OperandType != + MCOI::OPERAND_PCREL) return false; int64_t Imm = Inst.getOperand(0).getImm(); @@ -285,8 +306,15 @@ public: switch (Inst.getOpcode()) { default: OpId = 0; + if (Inst.getNumOperands() == 0) + return false; break; + case ARM::MVE_WLSTP_8: + case ARM::MVE_WLSTP_16: + case ARM::MVE_WLSTP_32: + case ARM::MVE_WLSTP_64: case ARM::t2WLS: + case ARM::MVE_LETP: case ARM::t2LEUpdate: OpId = 2; break; @@ -316,6 +344,14 @@ static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) { return new ThumbMCInstrAnalysis(Info); } +bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) { + // Unfortunately we don't have ARMTargetInfo in the disassembler, so we have + // to rely on feature bits. + if (Coproc >= 8) + return false; + return STI.getFeatureBits()[ARM::FeatureCoprocCDE0 + Coproc]; +} + // Force static initialization. extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() { for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(), diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index 9cbbd56225ef..7cfe6881b456 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -107,6 +107,9 @@ inline bool isVpred(OperandType op) { inline bool isVpred(uint8_t op) { return isVpred(static_cast<OperandType>(op)); } + +bool isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI); + } // end namespace ARM } // End llvm namespace diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index 7b30a61e8ccb..1fee354cad93 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -80,7 +80,7 @@ void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) { default: llvm_unreachable("Invalid Suffix"); } - getStreamer().EmitBytes(StringRef(Buffer, Size)); + getStreamer().emitBytes(StringRef(Buffer, Size)); } // The remaining callbacks should be handled separately by each @@ -108,7 +108,7 @@ void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute, unsigned IntValue, StringRef StringValue) {} void ARMTargetStreamer::emitArch(ARM::ArchKind Arch) {} -void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {} +void ARMTargetStreamer::emitArchExtension(uint64_t ArchExt) {} void ARMTargetStreamer::emitObjectArch(ARM::ArchKind Arch) {} void ARMTargetStreamer::emitFPU(unsigned FPU) {} void ARMTargetStreamer::finishAttributeSection() {} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index a9460b70da56..781627c3c425 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -134,7 +134,7 @@ void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) { uint8_t Buff[16]; Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128; size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1); - EmitBytes(Buff, 
ULEBSize + 1); + emitBytes(Buff, ULEBSize + 1); } else if (Offset > 0) { if (Offset > 0x100) { EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h index 5fb7307159d1..ec11a78f8a7a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h @@ -64,7 +64,7 @@ public: OpBegins.push_back(OpBegins.back() + Opcodes.size()); } - /// Finalize the unwind opcode sequence for EmitBytes() + /// Finalize the unwind opcode sequence for emitBytes() void Finalize(unsigned &PersonalityIndex, SmallVectorImpl<uint8_t> &Result); @@ -80,7 +80,7 @@ private: OpBegins.push_back(OpBegins.back() + 2); } - void EmitBytes(const uint8_t *Opcode, size_t Size) { + void emitBytes(const uint8_t *Opcode, size_t Size) { Ops.insert(Ops.end(), Opcode, Opcode + Size); OpBegins.push_back(OpBegins.back() + Size); } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index b3c8146a9bde..e6f649164a29 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -22,18 +22,18 @@ public: std::unique_ptr<MCObjectWriter> OW) : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {} - void EmitThumbFunc(MCSymbol *Symbol) override; - void FinishImpl() override; + void emitThumbFunc(MCSymbol *Symbol) override; + void finishImpl() override; }; -void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { +void ARMWinCOFFStreamer::emitThumbFunc(MCSymbol *Symbol) { getAssembler().setIsThumbFunc(Symbol); } -void ARMWinCOFFStreamer::FinishImpl() { - EmitFrames(nullptr); +void ARMWinCOFFStreamer::finishImpl() { + emitFrames(nullptr); - MCWinCOFFStreamer::FinishImpl(); + MCWinCOFFStreamer::finishImpl(); } } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index 9f64af02e698..4d7ad6cd60cb 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -15,6 +15,7 @@ #include "ARM.h" #include "ARMBaseInstrInfo.h" #include "ARMSubtarget.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -37,6 +38,7 @@ #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> @@ -67,27 +69,77 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired<TargetPassConfig>(); + AU.addRequired<LoopInfoWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } private: + LoopInfo *LI = nullptr; + // Check this is a valid gather with correct alignment bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, - unsigned Alignment); + Align Alignment); // Check whether Ptr is hidden behind a bitcast and look through it void lookThroughBitcast(Value *&Ptr); // Check for a getelementptr and deduce base and offsets from it, on success // returning the base directly and the offsets indirectly using the 
Offsets // argument - Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder); + Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP, + IRBuilder<> &Builder); + // Compute the scale of this gather/scatter instruction + int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize); + // If the value is a constant, or derived from constants via additions + // and multiplications, return its numeric value + Optional<int64_t> getIfConst(const Value *V); + // If Inst is an add instruction, check whether one summand is a + // constant. If so, scale this constant and return it together with + // the other summand. + std::pair<Value *, int64_t> getVarAndConst(Value *Inst, int TypeScale); - bool lowerGather(IntrinsicInst *I); + Value *lowerGather(IntrinsicInst *I); // Create a gather from a base + vector of offsets Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr, - IRBuilder<> Builder); + Instruction *&Root, IRBuilder<> &Builder); // Create a gather from a vector of pointers Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr, - IRBuilder<> Builder); + IRBuilder<> &Builder, int64_t Increment = 0); + // Create an incrementing gather from a vector of pointers + Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr, + IRBuilder<> &Builder, + int64_t Increment = 0); + + Value *lowerScatter(IntrinsicInst *I); + // Create a scatter to a base + vector of offsets + Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets, + IRBuilder<> &Builder); + // Create a scatter to a vector of pointers + Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr, + IRBuilder<> &Builder, + int64_t Increment = 0); + // Create an incrementing scatter from a vector of pointers + Value *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr, + IRBuilder<> &Builder, + int64_t Increment = 0); + + // QI gathers and scatters can increment their offsets on their own if + // the increment is a constant value (digit) + Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr, + Value *Ptr, GetElementPtrInst *GEP, + IRBuilder<> &Builder); + // QI gathers/scatters can increment their offsets on their own if the + // increment is a constant value (digit) - this creates a writeback QI + // gather/scatter + Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr, + Value *Ptr, unsigned TypeScale, + IRBuilder<> &Builder); + // Check whether these offsets could be moved out of the loop they're in + bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI); + // Pushes the given add out of the loop + void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex); + // Pushes the given mul out of the loop + void pushOutMul(PHINode *&Phi, Value *IncrementPerRound, + Value *OffsSecondOperand, unsigned LoopIncrement, + IRBuilder<> &Builder); }; } // end anonymous namespace @@ -103,102 +155,177 @@ Pass *llvm::createMVEGatherScatterLoweringPass() { bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, - unsigned Alignment) { - // Do only allow non-extending gathers for now - if (((NumElements == 4 && ElemSize == 32) || - (NumElements == 8 && ElemSize == 16) || + Align Alignment) { + if (((NumElements == 4 && + (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) || + (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) || (NumElements == 16 && ElemSize == 8)) && - ElemSize / 8 <= Alignment) + Alignment >= ElemSize / 8) return true; - LLVM_DEBUG(dbgs() <<
"masked gathers: instruction does not have valid " - << "alignment or vector type \n"); + LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have " + << "valid alignment or vector type \n"); return false; } -Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, - IRBuilder<> Builder) { - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); +Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, + GetElementPtrInst *GEP, + IRBuilder<> &Builder) { if (!GEP) { - LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n"); + LLVM_DEBUG( + dbgs() << "masked gathers/scatters: no getelementpointer found\n"); return nullptr; } - LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading" - << " from base + vector of offsets\n"); + LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found." + << " Looking at intrinsic for base + vector of offsets\n"); Value *GEPPtr = GEP->getPointerOperand(); if (GEPPtr->getType()->isVectorTy()) { - LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers" - << " hidden behind a getelementptr currently not" - << " supported. Expanding.\n"); return nullptr; } if (GEP->getNumOperands() != 2) { - LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many" + LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many" << " operands. Expanding.\n"); return nullptr; } Offsets = GEP->getOperand(1); - // SExt offsets inside masked gathers are not permitted by the architecture; - // we therefore can't fold them + // Paranoid check whether the number of parallel lanes is the same + assert(cast<FixedVectorType>(Ty)->getNumElements() == + cast<FixedVectorType>(Offsets->getType())->getNumElements()); + // Only <N x i32> offsets can be integrated into an arm gather, any smaller + // type would have to be sign extended by the gep - and arm gathers can only + // zero extend. Additionally, the offsets do have to originate from a zext of + // a vector with element types smaller or equal the type of the gather we're + // looking at + if (Offsets->getType()->getScalarSizeInBits() != 32) + return nullptr; if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets)) Offsets = ZextOffs->getOperand(0); - Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty)); - // If the offset we found does not have the type the intrinsic expects, - // i.e., the same type as the gather itself, we need to convert it (only i - // types) or fall back to expanding the gather - if (OffsType != Offsets->getType()) { - if (OffsType->getScalarSizeInBits() > - Offsets->getType()->getScalarSizeInBits()) { - LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n"); - Offsets = Builder.CreateZExt(Offsets, OffsType, ""); - } else { - LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't" - << " create masked gather\n"); + else if (!(cast<FixedVectorType>(Offsets->getType())->getNumElements() == 4 && + Offsets->getType()->getScalarSizeInBits() == 32)) + return nullptr; + + if (Ty != Offsets->getType()) { + if ((Ty->getScalarSizeInBits() < + Offsets->getType()->getScalarSizeInBits())) { + LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type." 
+ << " Can't create intrinsic.\n"); return nullptr; + } else { + Offsets = Builder.CreateZExt( + Offsets, VectorType::getInteger(cast<VectorType>(Ty))); } } // If none of the checks failed, return the gep's base pointer + LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n"); return GEPPtr; } void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) { // Look through bitcast instruction if #elements is the same if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) { - Type *BCTy = BitCast->getType(); - Type *BCSrcTy = BitCast->getOperand(0)->getType(); - if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) { - LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n"); + auto *BCTy = cast<FixedVectorType>(BitCast->getType()); + auto *BCSrcTy = cast<FixedVectorType>(BitCast->getOperand(0)->getType()); + if (BCTy->getNumElements() == BCSrcTy->getNumElements()) { + LLVM_DEBUG( + dbgs() << "masked gathers/scatters: looking through bitcast\n"); Ptr = BitCast->getOperand(0); } } } -bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { +int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize, + unsigned MemoryElemSize) { + // This can be a 32bit load/store scaled by 4, a 16bit load/store scaled by 2, + // or a 8bit, 16bit or 32bit load/store scaled by 1 + if (GEPElemSize == 32 && MemoryElemSize == 32) + return 2; + else if (GEPElemSize == 16 && MemoryElemSize == 16) + return 1; + else if (GEPElemSize == 8) + return 0; + LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't " + << "create intrinsic\n"); + return -1; +} + +Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) { + const Constant *C = dyn_cast<Constant>(V); + if (C != nullptr) + return Optional<int64_t>{C->getUniqueInteger().getSExtValue()}; + if (!isa<Instruction>(V)) + return Optional<int64_t>{}; + + const Instruction *I = cast<Instruction>(V); + if (I->getOpcode() == Instruction::Add || + I->getOpcode() == Instruction::Mul) { + Optional<int64_t> Op0 = getIfConst(I->getOperand(0)); + Optional<int64_t> Op1 = getIfConst(I->getOperand(1)); + if (!Op0 || !Op1) + return Optional<int64_t>{}; + if (I->getOpcode() == Instruction::Add) + return Optional<int64_t>{Op0.getValue() + Op1.getValue()}; + if (I->getOpcode() == Instruction::Mul) + return Optional<int64_t>{Op0.getValue() * Op1.getValue()}; + } + return Optional<int64_t>{}; +} + +std::pair<Value *, int64_t> +MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) { + std::pair<Value *, int64_t> ReturnFalse = + std::pair<Value *, int64_t>(nullptr, 0); + // At this point, the instruction we're looking at must be an add or we + // bail out + Instruction *Add = dyn_cast<Instruction>(Inst); + if (Add == nullptr || Add->getOpcode() != Instruction::Add) + return ReturnFalse; + + Value *Summand; + Optional<int64_t> Const; + // Find out which operand the value that is increased is + if ((Const = getIfConst(Add->getOperand(0)))) + Summand = Add->getOperand(1); + else if ((Const = getIfConst(Add->getOperand(1)))) + Summand = Add->getOperand(0); + else + return ReturnFalse; + + // Check that the constant is small enough for an incrementing gather + int64_t Immediate = Const.getValue() << TypeScale; + if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0) + return ReturnFalse; + + return std::pair<Value *, int64_t>(Summand, Immediate); +} + +Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { using namespace PatternMatch; LLVM_DEBUG(dbgs() << "masked gathers: checking 
transform preconditions\n"); // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) // Attempt to turn the masked gather in I into a MVE intrinsic // Potentially optimising the addressing modes as we do so. - Type *Ty = I->getType(); + auto *Ty = cast<FixedVectorType>(I->getType()); Value *Ptr = I->getArgOperand(0); - unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue(); + Align Alignment = cast<ConstantInt>(I->getArgOperand(1))->getAlignValue(); Value *Mask = I->getArgOperand(2); Value *PassThru = I->getArgOperand(3); - if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(), - Ty->getScalarSizeInBits(), Alignment)) - return false; + if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(), + Alignment)) + return nullptr; lookThroughBitcast(Ptr); assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); IRBuilder<> Builder(I->getContext()); Builder.SetInsertPoint(I); Builder.SetCurrentDebugLocation(I->getDebugLoc()); - Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder); + + Instruction *Root = I; + Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder); if (!Load) Load = tryCreateMaskedGatherBase(I, Ptr, Builder); if (!Load) - return false; + return nullptr; if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) { LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - " @@ -206,72 +333,649 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { Load = Builder.CreateSelect(Mask, Load, PassThru); } + Root->replaceAllUsesWith(Load); + Root->eraseFromParent(); + if (Root != I) + // If this was an extending gather, we need to get rid of the sext/zext + // sext/zext as well as of the gather itself + I->eraseFromParent(); + LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n"); - I->replaceAllUsesWith(Load); - I->eraseFromParent(); - return true; + return Load; } -Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase( - IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) { +Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I, + Value *Ptr, + IRBuilder<> &Builder, + int64_t Increment) { using namespace PatternMatch; + auto *Ty = cast<FixedVectorType>(I->getType()); LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n"); - Type *Ty = I->getType(); - if (Ty->getVectorNumElements() != 4) + if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) // Can't build an intrinsic for this return nullptr; Value *Mask = I->getArgOperand(2); if (match(Mask, m_One())) return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, {Ty, Ptr->getType()}, - {Ptr, Builder.getInt32(0)}); + {Ptr, Builder.getInt32(Increment)}); else return Builder.CreateIntrinsic( Intrinsic::arm_mve_vldr_gather_base_predicated, {Ty, Ptr->getType(), Mask->getType()}, - {Ptr, Builder.getInt32(0), Mask}); + {Ptr, Builder.getInt32(Increment), Mask}); +} + +Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB( + IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { + using namespace PatternMatch; + auto *Ty = cast<FixedVectorType>(I->getType()); + LLVM_DEBUG( + dbgs() + << "masked gathers: loading from vector of pointers with writeback\n"); + if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) + // Can't build an intrinsic for this + return nullptr; + Value *Mask = I->getArgOperand(2); + if (match(Mask, m_One())) + return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb, + {Ty, Ptr->getType()}, + {Ptr, 
Builder.getInt32(Increment)}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_base_wb_predicated, + {Ty, Ptr->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(Increment), Mask}); } Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( - IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) { + IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) { using namespace PatternMatch; - Type *Ty = I->getType(); + + Type *OriginalTy = I->getType(); + Type *ResultTy = OriginalTy; + + unsigned Unsigned = 1; + // The size of the gather was already checked in isLegalTypeAndAlignment; + // if it was not a full vector width an appropriate extend should follow. + auto *Extend = Root; + if (OriginalTy->getPrimitiveSizeInBits() < 128) { + // Only transform gathers with exactly one use + if (!I->hasOneUse()) + return nullptr; + + // The correct root to replace is not the CallInst itself, but the + // instruction which extends it + Extend = cast<Instruction>(*I->users().begin()); + if (isa<SExtInst>(Extend)) { + Unsigned = 0; + } else if (!isa<ZExtInst>(Extend)) { + LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. " + << "Expanding\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n"); + ResultTy = Extend->getType(); + // The final size of the gather must be a full vector width + if (ResultTy->getPrimitiveSizeInBits() != 128) { + LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. " + << "Expanding\n"); + return nullptr; + } + } + + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder); + Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder); if (!BasePtr) return nullptr; + // Check whether the offset is a constant increment that could be merged into + // a QI gather + Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder); + if (Load) + return Load; - unsigned Scale; - int GEPElemSize = - BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(); - int ResultElemSize = Ty->getScalarSizeInBits(); - // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a - // 8bit, 16bit or 32bit load scaled by 1 - if (GEPElemSize == 32 && ResultElemSize == 32) { - Scale = 2; - } else if (GEPElemSize == 16 && ResultElemSize == 16) { - Scale = 1; - } else if (GEPElemSize == 8) { - Scale = 0; - } else { - LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. 
Can't" - << " create masked gather\n"); + int Scale = computeScale( + BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), + OriginalTy->getScalarSizeInBits()); + if (Scale == -1) return nullptr; - } + Root = Extend; Value *Mask = I->getArgOperand(2); if (!match(Mask, m_One())) return Builder.CreateIntrinsic( Intrinsic::arm_mve_vldr_gather_offset_predicated, - {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()}, - {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), - Builder.getInt32(Scale), Builder.getInt32(1), Mask}); + {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()}, + {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask}); else return Builder.CreateIntrinsic( Intrinsic::arm_mve_vldr_gather_offset, - {Ty, BasePtr->getType(), Offsets->getType()}, - {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), - Builder.getInt32(Scale), Builder.getInt32(1)}); + {ResultTy, BasePtr->getType(), Offsets->getType()}, + {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(Unsigned)}); +} + +Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) { + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n"); + + // @llvm.masked.scatter.*(data, ptrs, alignment, mask) + // Attempt to turn the masked scatter in I into a MVE intrinsic + // Potentially optimising the addressing modes as we do so. + Value *Input = I->getArgOperand(0); + Value *Ptr = I->getArgOperand(1); + Align Alignment = cast<ConstantInt>(I->getArgOperand(2))->getAlignValue(); + auto *Ty = cast<FixedVectorType>(Input->getType()); + + if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(), + Alignment)) + return nullptr; + + lookThroughBitcast(Ptr); + assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); + + IRBuilder<> Builder(I->getContext()); + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + + Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder); + if (!Store) + Store = tryCreateMaskedScatterBase(I, Ptr, Builder); + if (!Store) + return nullptr; + + LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n"); + I->eraseFromParent(); + return Store; +} + +Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase( + IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { + using namespace PatternMatch; + Value *Input = I->getArgOperand(0); + auto *Ty = cast<FixedVectorType>(Input->getType()); + // Only QR variants allow truncating + if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) { + // Can't build an intrinsic for this + return nullptr; + } + Value *Mask = I->getArgOperand(3); + // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask) + LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n"); + if (match(Mask, m_One())) + return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base, + {Ptr->getType(), Input->getType()}, + {Ptr, Builder.getInt32(Increment), Input}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vstr_scatter_base_predicated, + {Ptr->getType(), Input->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(Increment), Input, Mask}); +} + +Value *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB( + IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) { 
+ using namespace PatternMatch; + Value *Input = I->getArgOperand(0); + auto *Ty = cast<FixedVectorType>(Input->getType()); + LLVM_DEBUG( + dbgs() + << "masked scatters: storing to a vector of pointers with writeback\n"); + if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) + // Can't build an intrinsic for this + return nullptr; + Value *Mask = I->getArgOperand(3); + if (match(Mask, m_One())) + return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb, + {Ptr->getType(), Input->getType()}, + {Ptr, Builder.getInt32(Increment), Input}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vstr_scatter_base_wb_predicated, + {Ptr->getType(), Input->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(Increment), Input, Mask}); +} + +Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset( + IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) { + using namespace PatternMatch; + Value *Input = I->getArgOperand(0); + Value *Mask = I->getArgOperand(3); + Type *InputTy = Input->getType(); + Type *MemoryTy = InputTy; + LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing" + << " to base + vector of offsets\n"); + // If the input has been truncated, try to integrate that trunc into the + // scatter instruction (we don't care about alignment here) + if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) { + Value *PreTrunc = Trunc->getOperand(0); + Type *PreTruncTy = PreTrunc->getType(); + if (PreTruncTy->getPrimitiveSizeInBits() == 128) { + Input = PreTrunc; + InputTy = PreTruncTy; + } + } + if (InputTy->getPrimitiveSizeInBits() != 128) { + LLVM_DEBUG( + dbgs() << "masked scatters: cannot create scatters for non-standard" + << " input types. Expanding.\n"); + return nullptr; + } + + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + Value *Offsets; + Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder); + if (!BasePtr) + return nullptr; + // Check whether the offset is a constant increment that could be merged into + // a QI gather + Value *Store = + tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder); + if (Store) + return Store; + int Scale = computeScale( + BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), + MemoryTy->getScalarSizeInBits()); + if (Scale == -1) + return nullptr; + + if (!match(Mask, m_One())) + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vstr_scatter_offset_predicated, + {BasePtr->getType(), Offsets->getType(), Input->getType(), + Mask->getType()}, + {BasePtr, Offsets, Input, + Builder.getInt32(MemoryTy->getScalarSizeInBits()), + Builder.getInt32(Scale), Mask}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vstr_scatter_offset, + {BasePtr->getType(), Offsets->getType(), Input->getType()}, + {BasePtr, Offsets, Input, + Builder.getInt32(MemoryTy->getScalarSizeInBits()), + Builder.getInt32(Scale)}); +} + +Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat( + IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP, + IRBuilder<> &Builder) { + FixedVectorType *Ty; + if (I->getIntrinsicID() == Intrinsic::masked_gather) + Ty = cast<FixedVectorType>(I->getType()); + else + Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType()); + // Incrementing gathers only exist for v4i32 + if (Ty->getNumElements() != 4 || + Ty->getScalarSizeInBits() != 32) + return nullptr; + Loop *L = LI->getLoopFor(I->getParent()); + if (L == nullptr) + // Incrementing gathers are not beneficial outside of a loop + return nullptr; + LLVM_DEBUG(dbgs() << 
"masked gathers/scatters: trying to build incrementing " + "wb gather/scatter\n"); + + // The gep was in charge of making sure the offsets are scaled correctly + // - calculate that factor so it can be applied by hand + DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout(); + int TypeScale = + computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()), + DT.getTypeSizeInBits(GEP->getType()) / + cast<FixedVectorType>(GEP->getType())->getNumElements()); + if (TypeScale == -1) + return nullptr; + + if (GEP->hasOneUse()) { + // Only in this case do we want to build a wb gather, because the wb will + // change the phi which does affect other users of the gep (which will still + // be using the phi in the old way) + Value *Load = + tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, TypeScale, Builder); + if (Load != nullptr) + return Load; + } + LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing " + "non-wb gather/scatter\n"); + + std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale); + if (Add.first == nullptr) + return nullptr; + Value *OffsetsIncoming = Add.first; + int64_t Immediate = Add.second; + + // Make sure the offsets are scaled correctly + Instruction *ScaledOffsets = BinaryOperator::Create( + Instruction::Shl, OffsetsIncoming, + Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)), + "ScaledIndex", I); + // Add the base to the offsets + OffsetsIncoming = BinaryOperator::Create( + Instruction::Add, ScaledOffsets, + Builder.CreateVectorSplat( + Ty->getNumElements(), + Builder.CreatePtrToInt( + BasePtr, + cast<VectorType>(ScaledOffsets->getType())->getElementType())), + "StartIndex", I); + + if (I->getIntrinsicID() == Intrinsic::masked_gather) + return cast<IntrinsicInst>( + tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate)); + else + return cast<IntrinsicInst>( + tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate)); +} + +Value *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat( + IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale, + IRBuilder<> &Builder) { + // Check whether this gather's offset is incremented by a constant - if so, + // and the load is of the right type, we can merge this into a QI gather + Loop *L = LI->getLoopFor(I->getParent()); + // Offsets that are worth merging into this instruction will be incremented + // by a constant, thus we're looking for an add of a phi and a constant + PHINode *Phi = dyn_cast<PHINode>(Offsets); + if (Phi == nullptr || Phi->getNumIncomingValues() != 2 || + Phi->getParent() != L->getHeader() || Phi->getNumUses() != 2) + // No phi means no IV to write back to; if there is a phi, we expect it + // to have exactly two incoming values; the only phis we are interested in + // will be loop IV's and have exactly two uses, one in their increment and + // one in the gather's gep + return nullptr; + + unsigned IncrementIndex = + Phi->getIncomingBlock(0) == L->getLoopLatch() ? 
0 : 1; + // Look through the phi to the phi increment + Offsets = Phi->getIncomingValue(IncrementIndex); + + std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale); + if (Add.first == nullptr) + return nullptr; + Value *OffsetsIncoming = Add.first; + int64_t Immediate = Add.second; + if (OffsetsIncoming != Phi) + // Then the increment we are looking at is not an increment of the + // induction variable, and we don't want to do a writeback + return nullptr; + + Builder.SetInsertPoint(&Phi->getIncomingBlock(1 - IncrementIndex)->back()); + unsigned NumElems = + cast<FixedVectorType>(OffsetsIncoming->getType())->getNumElements(); + + // Make sure the offsets are scaled correctly + Instruction *ScaledOffsets = BinaryOperator::Create( + Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex), + Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)), + "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back()); + // Add the base to the offsets + OffsetsIncoming = BinaryOperator::Create( + Instruction::Add, ScaledOffsets, + Builder.CreateVectorSplat( + NumElems, + Builder.CreatePtrToInt( + BasePtr, + cast<VectorType>(ScaledOffsets->getType())->getElementType())), + "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back()); + // The gather is pre-incrementing + OffsetsIncoming = BinaryOperator::Create( + Instruction::Sub, OffsetsIncoming, + Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)), + "PreIncrementStartIndex", + &Phi->getIncomingBlock(1 - IncrementIndex)->back()); + Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming); + + Builder.SetInsertPoint(I); + + Value *EndResult; + Value *NewInduction; + if (I->getIntrinsicID() == Intrinsic::masked_gather) { + // Build the incrementing gather + Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate); + // One value to be handed to whoever uses the gather, one is the loop + // increment + EndResult = Builder.CreateExtractValue(Load, 0, "Gather"); + NewInduction = Builder.CreateExtractValue(Load, 1, "GatherIncrement"); + } else { + // Build the incrementing scatter + NewInduction = tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate); + EndResult = NewInduction; + } + Instruction *AddInst = cast<Instruction>(Offsets); + AddInst->replaceAllUsesWith(NewInduction); + AddInst->eraseFromParent(); + Phi->setIncomingValue(IncrementIndex, NewInduction); + + return EndResult; +} + +void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi, + Value *OffsSecondOperand, + unsigned StartIndex) { + LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n"); + Instruction *InsertionPoint = + &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back()); + // Initialize the phi with a vector that contains a sum of the constants + Instruction *NewIndex = BinaryOperator::Create( + Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand, + "PushedOutAdd", InsertionPoint); + unsigned IncrementIndex = StartIndex == 0 ? 
1 : 0; + + // Order such that start index comes first (this reduces mov's) + Phi->addIncoming(NewIndex, Phi->getIncomingBlock(StartIndex)); + Phi->addIncoming(Phi->getIncomingValue(IncrementIndex), + Phi->getIncomingBlock(IncrementIndex)); + Phi->removeIncomingValue(IncrementIndex); + Phi->removeIncomingValue(StartIndex); +} + +void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi, + Value *IncrementPerRound, + Value *OffsSecondOperand, + unsigned LoopIncrement, + IRBuilder<> &Builder) { + LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n"); + + // Create a new scalar add outside of the loop and transform it to a splat + // by which loop variable can be incremented + Instruction *InsertionPoint = &cast<Instruction>( + Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back()); + + // Create a new index + Value *StartIndex = BinaryOperator::Create( + Instruction::Mul, Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1), + OffsSecondOperand, "PushedOutMul", InsertionPoint); + + Instruction *Product = + BinaryOperator::Create(Instruction::Mul, IncrementPerRound, + OffsSecondOperand, "Product", InsertionPoint); + // Increment NewIndex by Product instead of the multiplication + Instruction *NewIncrement = BinaryOperator::Create( + Instruction::Add, Phi, Product, "IncrementPushedOutMul", + cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back()) + .getPrevNode()); + + Phi->addIncoming(StartIndex, + Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)); + Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement)); + Phi->removeIncomingValue((unsigned)0); + Phi->removeIncomingValue((unsigned)0); + return; +} + +// Check whether all usages of this instruction are as offsets of +// gathers/scatters or simple arithmetics only used by gathers/scatters +static bool hasAllGatScatUsers(Instruction *I) { + if (I->hasNUses(0)) { + return false; + } + bool Gatscat = true; + for (User *U : I->users()) { + if (!isa<Instruction>(U)) + return false; + if (isa<GetElementPtrInst>(U) || + isGatherScatter(dyn_cast<IntrinsicInst>(U))) { + return Gatscat; + } else { + unsigned OpCode = cast<Instruction>(U)->getOpcode(); + if ((OpCode == Instruction::Add || OpCode == Instruction::Mul) && + hasAllGatScatUsers(cast<Instruction>(U))) { + continue; + } + return false; + } + } + return Gatscat; +} + +bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB, + LoopInfo *LI) { + LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n"); + // Optimise the addresses of gathers/scatters by moving invariant + // calculations out of the loop + if (!isa<Instruction>(Offsets)) + return false; + Instruction *Offs = cast<Instruction>(Offsets); + if (Offs->getOpcode() != Instruction::Add && + Offs->getOpcode() != Instruction::Mul) + return false; + Loop *L = LI->getLoopFor(BB); + if (L == nullptr) + return false; + if (!Offs->hasOneUse()) { + if (!hasAllGatScatUsers(Offs)) + return false; + } + + // Find out which, if any, operand of the instruction + // is a phi node + PHINode *Phi; + int OffsSecondOp; + if (isa<PHINode>(Offs->getOperand(0))) { + Phi = cast<PHINode>(Offs->getOperand(0)); + OffsSecondOp = 1; + } else if (isa<PHINode>(Offs->getOperand(1))) { + Phi = cast<PHINode>(Offs->getOperand(1)); + OffsSecondOp = 0; + } else { + bool Changed = true; + if (isa<Instruction>(Offs->getOperand(0)) && + L->contains(cast<Instruction>(Offs->getOperand(0)))) + Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI); + if (isa<Instruction>(Offs->getOperand(1)) && + 
L->contains(cast<Instruction>(Offs->getOperand(1)))) + Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI); + if (!Changed) { + return false; + } else { + if (isa<PHINode>(Offs->getOperand(0))) { + Phi = cast<PHINode>(Offs->getOperand(0)); + OffsSecondOp = 1; + } else if (isa<PHINode>(Offs->getOperand(1))) { + Phi = cast<PHINode>(Offs->getOperand(1)); + OffsSecondOp = 0; + } else { + return false; + } + } + } + // A phi node we want to perform this function on should be from the + // loop header, and shouldn't have more than 2 incoming values + if (Phi->getParent() != L->getHeader() || + Phi->getNumIncomingValues() != 2) + return false; + + // The phi must be an induction variable + Instruction *Op; + int IncrementingBlock = -1; + + for (int i = 0; i < 2; i++) + if ((Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) != nullptr) + if (Op->getOpcode() == Instruction::Add && + (Op->getOperand(0) == Phi || Op->getOperand(1) == Phi)) + IncrementingBlock = i; + if (IncrementingBlock == -1) + return false; + + Instruction *IncInstruction = + cast<Instruction>(Phi->getIncomingValue(IncrementingBlock)); + + // If the phi is not used by anything else, we can just adapt it when + // replacing the instruction; if it is, we'll have to duplicate it + PHINode *NewPhi; + Value *IncrementPerRound = IncInstruction->getOperand( + (IncInstruction->getOperand(0) == Phi) ? 1 : 0); + + // Get the value that is added to/multiplied with the phi + Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp); + + if (IncrementPerRound->getType() != OffsSecondOperand->getType()) + // Something has gone wrong, abort + return false; + + // Only proceed if the increment per round is a constant or an instruction + // which does not originate from within the loop + if (!isa<Constant>(IncrementPerRound) && + !(isa<Instruction>(IncrementPerRound) && + !L->contains(cast<Instruction>(IncrementPerRound)))) + return false; + + if (Phi->getNumUses() == 2) { + // No other users -> reuse existing phi (One user is the instruction + // we're looking at, the other is the phi increment) + if (IncInstruction->getNumUses() != 1) { + // If the incrementing instruction does have more users than + // our phi, we need to copy it + IncInstruction = BinaryOperator::Create( + Instruction::BinaryOps(IncInstruction->getOpcode()), Phi, + IncrementPerRound, "LoopIncrement", IncInstruction); + Phi->setIncomingValue(IncrementingBlock, IncInstruction); + } + NewPhi = Phi; + } else { + // There are other users -> create a new phi + NewPhi = PHINode::Create(Phi->getType(), 0, "NewPhi", Phi); + std::vector<Value *> Increases; + // Copy the incoming values of the old phi + NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1), + Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1)); + IncInstruction = BinaryOperator::Create( + Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi, + IncrementPerRound, "LoopIncrement", IncInstruction); + NewPhi->addIncoming(IncInstruction, + Phi->getIncomingBlock(IncrementingBlock)); + IncrementingBlock = 1; + } + + IRBuilder<> Builder(BB->getContext()); + Builder.SetInsertPoint(Phi); + Builder.SetCurrentDebugLocation(Offs->getDebugLoc()); + + switch (Offs->getOpcode()) { + case Instruction::Add: + pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 
0 : 1); + break; + case Instruction::Mul: + pushOutMul(NewPhi, IncrementPerRound, OffsSecondOperand, IncrementingBlock, + Builder); + break; + default: + return false; + } + LLVM_DEBUG( + dbgs() << "masked gathers/scatters: simplified loop variable add/mul\n"); + + // The instruction has now been "absorbed" into the phi value + Offs->replaceAllUsesWith(NewPhi); + if (Offs->hasNUses(0)) + Offs->eraseFromParent(); + // Clean up the old increment in case it's unused because we built a new + // one + if (IncInstruction->hasNUses(0)) + IncInstruction->eraseFromParent(); + + return true; } bool MVEGatherScatterLowering::runOnFunction(Function &F) { @@ -282,20 +986,51 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) { auto *ST = &TM.getSubtarget<ARMSubtarget>(F); if (!ST->hasMVEIntegerOps()) return false; + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SmallVector<IntrinsicInst *, 4> Gathers; + SmallVector<IntrinsicInst *, 4> Scatters; + + bool Changed = false; + for (BasicBlock &BB : F) { for (Instruction &I : BB) { IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::masked_gather) + if (II && II->getIntrinsicID() == Intrinsic::masked_gather) { Gathers.push_back(II); + if (isa<GetElementPtrInst>(II->getArgOperand(0))) + Changed |= optimiseOffsets( + cast<Instruction>(II->getArgOperand(0))->getOperand(1), + II->getParent(), LI); + } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) { + Scatters.push_back(II); + if (isa<GetElementPtrInst>(II->getArgOperand(1))) + Changed |= optimiseOffsets( + cast<Instruction>(II->getArgOperand(1))->getOperand(1), + II->getParent(), LI); + } } } - if (Gathers.empty()) - return false; + for (unsigned i = 0; i < Gathers.size(); i++) { + IntrinsicInst *I = Gathers[i]; + Value *L = lowerGather(I); + if (L == nullptr) + continue; - for (IntrinsicInst *I : Gathers) - lowerGather(I); + // Get rid of any now dead instructions + SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent()); + Changed = true; + } - return true; + for (unsigned i = 0; i < Scatters.size(); i++) { + IntrinsicInst *I = Scatters[i]; + Value *S = lowerScatter(I); + if (S == nullptr) + continue; + + // Get rid of any now dead instructions + SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent()); + Changed = true; + } + return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp index 038c68739cdf..5bf3522ab2e6 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -1,4 +1,4 @@ -//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===// +//===- MVETailPredication.cpp - MVE Tail Predication ------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,8 +8,17 @@ // /// \file /// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead -/// branches to help accelerate DSP applications. These two extensions can be -/// combined to provide implicit vector predication within a low-overhead loop. +/// branches to help accelerate DSP applications. These two extensions, +/// combined with a new form of predication called tail-predication, can be used +/// to provide implicit vector predication within a low-overhead loop. 
+/// This is implicit because the predicate of active/inactive lanes is
+/// calculated by hardware, and thus does not need to be explicitly passed
+/// to vector instructions. The instructions responsible for this are the
+/// DLSTP and WLSTP instructions, which set up a tail-predicated loop and
+/// the total number of data elements processed by the loop. The loop-end
+/// LETP instruction is responsible for decrementing and setting the remaining
+/// elements to be processed and generating the mask of active lanes.
+///
 /// The HardwareLoops pass inserts intrinsics identifying loops that the
 /// backend will attempt to convert into a low-overhead loop. The vectorizer is
 /// responsible for generating a vectorized loop in which the lanes are
@@ -21,36 +30,62 @@
 /// - A loop containing multiple VCPT instructions, predicating multiple VPT
 /// blocks of instructions operating on different vector types.
 ///
-/// This pass inserts the inserts the VCTP intrinsic to represent the effect of
-/// tail predication. This will be picked up by the ARM Low-overhead loop pass,
-/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
-/// loop.
+/// This pass:
+/// 1) Checks if the predicates of the masked load/store instructions are
+///    generated by intrinsic @llvm.get.active.lane.mask(). This intrinsic
+///    consumes the Backedge Taken Count (BTC) of the scalar loop as its
+///    second argument, which we extract to set up the number of elements
+///    processed by the loop.
+/// 2) Intrinsic @llvm.get.active.lane.mask() is then replaced by the MVE
+///    target-specific VCTP intrinsic to represent the effect of tail
+///    predication. This will be picked up by the ARM Low-overhead loop pass,
+///    which performs the final transformation to a DLSTP or WLSTP
+///    tail-predicated loop.
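///
/// As a rough illustration (editorial sketch, not part of this patch), the
/// vectorized form of a simple loop carries an explicit lane mask:
///
///   for (i = 0; i < N; i += 4) {               // 4-wide vector loop
///     mask = get_active_lane_mask(i, N - 1);   // lanes with i+l <= N-1 on
///     vstore(&c[i], vadd(vload(&a[i], mask), vload(&b[i], mask)), mask);
///   }
///
/// After tail-predication, DLSTP/WLSTP set up the element count and LETP
/// decrements it and regenerates the mask on each iteration, so no explicit
/// mask computation remains in the loop body. The helpers above are
/// pseudocode, not real intrinsics.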
#include "ARM.h" #include "ARMSubtarget.h" +#include "ARMTargetTransformInfo.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; #define DEBUG_TYPE "mve-tail-predication" #define DESC "Transform predicated vector loops to use MVE tail predication" -cl::opt<bool> -DisableTailPredication("disable-mve-tail-predication", cl::Hidden, - cl::init(true), - cl::desc("Disable MVE Tail Predication")); +cl::opt<TailPredication::Mode> EnableTailPredication( + "tail-predication", cl::desc("MVE tail-predication options"), + cl::init(TailPredication::Disabled), + cl::values(clEnumValN(TailPredication::Disabled, "disabled", + "Don't tail-predicate loops"), + clEnumValN(TailPredication::EnabledNoReductions, + "enabled-no-reductions", + "Enable tail-predication, but not for reduction loops"), + clEnumValN(TailPredication::Enabled, + "enabled", + "Enable tail-predication, including reduction loops"), + clEnumValN(TailPredication::ForceEnabledNoReductions, + "force-enabled-no-reductions", + "Enable tail-predication, but not for reduction loops, " + "and force this which might be unsafe"), + clEnumValN(TailPredication::ForceEnabled, + "force-enabled", + "Enable tail-predication, including reduction loops, " + "and force this which might be unsafe"))); + + namespace { class MVETailPredication : public LoopPass { @@ -58,6 +93,7 @@ class MVETailPredication : public LoopPass { Loop *L = nullptr; ScalarEvolution *SE = nullptr; TargetTransformInfo *TTI = nullptr; + const ARMSubtarget *ST = nullptr; public: static char ID; @@ -76,7 +112,6 @@ public: bool runOnLoop(Loop *L, LPPassManager&) override; private: - /// Perform the relevant checks on the loop and convert if possible. bool TryConvert(Value *TripCount); @@ -84,19 +119,21 @@ private: /// load/stores. bool IsPredicatedVectorLoop(); - /// Compute a value for the total number of elements that the predicated - /// loop will process. - Value *ComputeElements(Value *TripCount, VectorType *VecTy); - - /// Is the icmp that generates an i1 vector, based upon a loop counter - /// and a limit that is defined outside the loop. - bool isTailPredicate(Instruction *Predicate, Value *NumElements); + /// Perform checks on the arguments of @llvm.get.active.lane.mask + /// intrinsic: check if the first is a loop induction variable, and for the + /// the second check that no overflow can occur in the expression that use + /// this backedge-taken count. + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy); /// Insert the intrinsic to represent the effect of tail predication. 
- void InsertVCTPIntrinsic(Instruction *Predicate, - DenseMap<Instruction*, Instruction*> &NewPredicates, - VectorType *VecTy, - Value *NumElements); + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy); + + /// Rematerialize the iteration count in exit blocks, which enables + /// ARMLowOverheadLoops to better optimise away loop update statements inside + /// hardware-loops. + void RematerializeIterCount(); }; } // end namespace @@ -121,13 +158,14 @@ static bool IsMasked(Instruction *I) { } bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { - if (skipLoop(L) || DisableTailPredication) + if (skipLoop(L) || !EnableTailPredication) return false; + MaskedInsts.clear(); Function &F = *L->getHeader()->getParent(); auto &TPC = getAnalysis<TargetPassConfig>(); auto &TM = TPC.getTM<TargetMachine>(); - auto *ST = &TM.getSubtarget<ARMSubtarget>(F); + ST = &TM.getSubtarget<ARMSubtarget>(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); this->L = L; @@ -185,125 +223,59 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - return TryConvert(Setup->getArgOperand(0)); -} -bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) { - // Look for the following: - - // %trip.count.minus.1 = add i32 %N, -1 - // %broadcast.splatinsert10 = insertelement <4 x i32> undef, - // i32 %trip.count.minus.1, i32 0 - // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, - // <4 x i32> undef, - // <4 x i32> zeroinitializer - // ... - // ... - // %index = phi i32 - // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, - // <4 x i32> undef, - // <4 x i32> zeroinitializer - // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> - // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11 - - // And return whether V == %pred. - - using namespace PatternMatch; - - CmpInst::Predicate Pred; - Instruction *Shuffle = nullptr; - Instruction *Induction = nullptr; - - // The vector icmp - if (!match(I, m_ICmp(Pred, m_Instruction(Induction), - m_Instruction(Shuffle))) || - Pred != ICmpInst::ICMP_ULE) - return false; - - // First find the stuff outside the loop which is setting up the limit - // vector.... - // The invariant shuffle that broadcast the limit into a vector. - Instruction *Insert = nullptr; - if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(), - m_Zero()))) - return false; - - // Insert the limit into a vector. - Instruction *BECount = nullptr; - if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount), - m_Zero()))) - return false; - - // The limit calculation, backedge count. - Value *TripCount = nullptr; - if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes()))) - return false; - - if (TripCount != NumElements || !L->isLoopInvariant(BECount)) - return false; - - // Now back to searching inside the loop body... - // Find the add with takes the index iv and adds a constant vector to it. - Instruction *BroadcastSplat = nullptr; - Constant *Const = nullptr; - if (!match(Induction, m_Add(m_Instruction(BroadcastSplat), - m_Constant(Const)))) - return false; - - // Check that we're adding <0, 1, 2, 3... 
- if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) { - for (unsigned i = 0; i < CDS->getNumElements(); ++i) { - if (CDS->getElementAsInteger(i) != i) - return false; - } - } else - return false; - - // The shuffle which broadcasts the index iv into a vector. - if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(), - m_Zero()))) - return false; - - // The insert element which initialises a vector with the index iv. - Instruction *IV = nullptr; - if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero()))) - return false; - - // The index iv. - auto *Phi = dyn_cast<PHINode>(IV); - if (!Phi) - return false; - - // TODO: Don't think we need to check the entry value. - Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader()); - if (!match(OnEntry, m_Zero())) - return false; - - Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch()); - unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements(); - - Instruction *LHS = nullptr; - if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes)))) + if (!TryConvert(Setup->getArgOperand(0))) { + LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); return false; + } - return LHS == Phi; + return true; } -static VectorType* getVectorType(IntrinsicInst *I) { +static FixedVectorType *getVectorType(IntrinsicInst *I) { unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType()); - return cast<VectorType>(PtrTy->getElementType()); + auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType()); + assert(VecTy && "No scalable vectors expected here"); + return VecTy; } bool MVETailPredication::IsPredicatedVectorLoop() { // Check that the loop contains at least one masked load/store intrinsic. // We only support 'normal' vector instructions - other than masked // load/stores. + bool ActiveLaneMask = false; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + auto *Int = dyn_cast<IntrinsicInst>(&I); + if (!Int) + continue; + + switch (Int->getIntrinsicID()) { + case Intrinsic::get_active_lane_mask: + ActiveLaneMask = true; + LLVM_FALLTHROUGH; + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::usub_sat: + continue; + case Intrinsic::fma: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::round: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::fabs: + if (ST->hasMVEFloatOps()) + continue; + LLVM_FALLTHROUGH; + default: + break; + } + if (IsMasked(&I)) { - VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + auto *VecTy = getVectorType(Int); unsigned Lanes = VecTy->getNumElements(); unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. 
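The rule referenced here, that MVE predication is only legal when the vector fits in a 128-bit Q register and is not a 128 x i1 predicate, reduces to a small standalone check. A minimal sketch of that rule (the helper name is ours, not part of this patch):

  // Legal iff Lanes x ElementWidth fits a 128-bit Q register and the
  // resulting predicate is not 128 x i1 (i.e. Lanes != 128).
  static bool isLegalMVEPredicatedVector(unsigned Lanes, unsigned ElementWidth) {
    const unsigned MaxWidth = 128;
    return Lanes * ElementWidth <= MaxWidth && Lanes != MaxWidth;
  }

For example, 4 x i32, 8 x i16 and 16 x i8 pass, while 16 x i32 (too wide) and 128 x i1 are rejected.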
@@ -312,94 +284,23 @@ bool MVETailPredication::IsPredicatedVectorLoop() { if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); - } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { - for (auto &U : Int->args()) { - if (isa<VectorType>(U->getType())) - return false; - } + continue; + } + + for (const Use &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; } } } + if (!ActiveLaneMask) { + LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); + return false; + } return !MaskedInsts.empty(); } -Value* MVETailPredication::ComputeElements(Value *TripCount, - VectorType *VecTy) { - const SCEV *TripCountSE = SE->getSCEV(TripCount); - ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()), - VecTy->getNumElements()); - - if (VF->equalsInt(1)) - return nullptr; - - // TODO: Support constant trip counts. - auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* { - if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) { - if (Const->getAPInt() != -VF->getValue()) - return nullptr; - } else - return nullptr; - return dyn_cast<SCEVMulExpr>(S->getOperand(1)); - }; - - auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* { - if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) { - if (Const->getValue() != VF) - return nullptr; - } else - return nullptr; - return dyn_cast<SCEVUDivExpr>(S->getOperand(1)); - }; - - auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* { - if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) { - if (Const->getValue() != VF) - return nullptr; - } else - return nullptr; - - if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) { - if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) { - if (Const->getAPInt() != (VF->getValue() - 1)) - return nullptr; - } else - return nullptr; - - return RoundUp->getOperand(1); - } - return nullptr; - }; - - // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to - // determine the numbers of elements instead? Looks like this is what is used - // for delinearization, but I'm not sure if it can be applied to the - // vectorized form - at least not without a bit more work than I feel - // comfortable with. - - // Search for Elems in the following SCEV: - // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw> - const SCEV *Elems = nullptr; - if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE)) - if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1))) - if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS())) - if (auto *Mul = VisitAdd(Add)) - if (auto *Div = VisitMul(Mul)) - if (auto *Res = VisitDiv(Div)) - Elems = Res; - - if (!Elems) - return nullptr; - - Instruction *InsertPt = L->getLoopPreheader()->getTerminator(); - if (!isSafeToExpandAt(Elems, InsertPt, *SE)) - return nullptr; - - auto DL = L->getHeader()->getModule()->getDataLayout(); - SCEVExpander Expander(*SE, DL, "elements"); - return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt); -} - // Look through the exit block to see whether there's a duplicate predicate // instruction. This can happen when we need to perform a select on values // from the last and previous iteration. Instead of doing a straight @@ -407,31 +308,13 @@ Value* MVETailPredication::ComputeElements(Value *TripCount, // in the block. 
This means that the VPR doesn't have to be live into the
// exit block which should make it easier to convert this loop into a proper
// tail predicated loop.
-static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
-                    SetVector<Instruction*> &MaybeDead, Loop *L) {
+static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
   BasicBlock *Exit = L->getUniqueExitBlock();
   if (!Exit) {
     LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
     return;
   }
 
-  for (auto &Pair : NewPredicates) {
-    Instruction *OldPred = Pair.first;
-    Instruction *NewPred = Pair.second;
-
-    for (auto &I : *Exit) {
-      if (I.isSameOperationAs(OldPred)) {
-        Instruction *PredClone = NewPred->clone();
-        PredClone->insertBefore(&I);
-        I.replaceAllUsesWith(PredClone);
-        MaybeDead.insert(&I);
-        LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
-                   dbgs() << "ARM TP: with: "; PredClone->dump());
-        break;
-      }
-    }
-  }
-
   // Drop references and add operands to check for dead.
   SmallPtrSet<Instruction*, 4> Dead;
   while (!MaybeDead.empty()) {
@@ -440,11 +323,10 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
     if (I->hasNUsesOrMore(1))
       continue;
 
-    for (auto &U : I->operands()) {
+    for (auto &U : I->operands())
       if (auto *OpI = dyn_cast<Instruction>(U))
         MaybeDead.insert(OpI);
-    }
-    I->dropAllReferences();
+
     Dead.insert(I);
   }
 
@@ -457,24 +339,211 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
     DeleteDeadPHIs(I);
 }
 
-void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
-    DenseMap<Instruction*, Instruction*> &NewPredicates,
-    VectorType *VecTy, Value *NumElements) {
-  IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+// The active lane intrinsic has this form:
+//
+//    @llvm.get.active.lane.mask(IV, BTC)
+//
+// Here we perform checks that this intrinsic behaves as expected,
+// which means:
+//
+// 1) The element count, which is calculated with BTC + 1, cannot overflow.
+// 2) The element count needs to be sufficiently large that the decrement of
+//    the element counter doesn't overflow, which means that we need to prove:
+//        ceil(ElementCount / VectorWidth) >= TripCount
+//    by rounding ElementCount up:
+//        (ElementCount + (VectorWidth - 1)) / VectorWidth
+//    and then checking that this expression is non-negative:
+//        ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+// 3) The IV must be an induction phi with an increment equal to the
+//    vector width.
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+    Value *TripCount, FixedVectorType *VecTy) {
+  bool ForceTailPredication =
+    EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
+    EnableTailPredication == TailPredication::ForceEnabled;
+  // 1) Test whether entry to the loop is protected by a conditional
+  // BTC + 1 < 0. In other words, if the scalar trip count overflows and
+  // becomes negative, we shouldn't enter the loop, and creating the
+  // tripcount expression BTC + 1 is not safe. So, check that BTC
+  // isn't max. This is evaluated in unsigned, because the semantics
+  // of @get.active.lane.mask is a ULE comparison.
+
+  int VectorWidth = VecTy->getNumElements();
+  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
+  auto *BTC = SE->getSCEV(BackedgeTakenCount);
+
+  if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+      !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
+               BTC->dump());
+    return false;
+  }
+
+  // 2) Prove that the sub expression is non-negative, i.e. it doesn't
+  // overflow:
+  //
+  //    ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+  //
+  // 2.1) First prove overflow can't happen in:
+  //
+  //    ElementCount + (VectorWidth - 1)
+  //
+  // Because of a lack of context, it is difficult to get a useful bound on
+  // this expression. But since ElementCount uses the same variables as the
+  // TripCount (TC), for which we can find meaningful value ranges, we use that
+  // instead and assert that:
+  //
+  //    upperbound(TC) <= UINT_MAX - VectorWidth
+  //
+  auto *TC = SE->getSCEV(TripCount);
+  unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+  auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
+  uint64_t MaxMinusVW = Diff.getZExtValue();
+  uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+
+  if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
+               dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
+               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
+    return false;
+  }
+
+  // 2.2) Make sure overflow doesn't happen in the final expression:
+  //
+  //    ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+  //
+  // To do this, compare the full ranges of these subexpressions:
+  //
+  //    Range(Ceil) <= Range(TC)
+  //
+  // where Ceil = (ElementCount + (VW - 1)) / VW. If Ceil and TC are runtime
+  // values (and not constants), we have to compensate for the lowerbound value
+  // range to be off by 1. The reason is that BTC lives in the preheader in
+  // this form:
+  //
+  //     %trip.count.minus = add nsw nuw i32 %N, -1
+  //
+  // For the loop to be executed, %N has to be >= 1 and as a result the value
+  // range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
+  //
+  //     %5 = add nuw nsw i32 %4, 1
+  //     call void @llvm.set.loop.iterations.i32(i32 %5)
+  //
+  // where %5 is some expression using %N, which needs to have a lower bound of
+  // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
+  // we first add 0 to TC such that we can do the <= comparison on both sets.
+  //
+  auto *One = SE->getOne(TripCount->getType());
+  // ElementCount = BTC + 1
+  auto *ElementCount = SE->getAddExpr(BTC, One);
+  // Tmp = ElementCount + (VW - 1)
+  auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+  // Ceil = (ElementCount + (VW - 1)) / VW
+  auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
+      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
+
+  ConstantRange RangeCeil = SE->getSignedRange(Ceil);
+  ConstantRange RangeTC = SE->getSignedRange(TC);
+  if (!RangeTC.isSingleElement()) {
+    auto ZeroRange =
+      ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
+    RangeTC = RangeTC.unionWith(ZeroRange);
+  }
+  if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
+    return false;
+  }
+
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
+  // helpers here to get the induction variable, because the hardware loop is
+  // no longer in loopsimplify form, and also the hwloop intrinsic uses a
+  // different counter. Using SCEV, we check that the induction is of the
+  // form i = i + 4, where the increment must be equal to the VectorWidth.
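  //
  // For example (illustrative values, not from this patch): in a 4-wide
  // loop, the IV handed to @llvm.get.active.lane.mask typically has the
  // SCEV form {0,+,4}<%vector.body>, i.e. an add recurrence on this loop
  // whose constant step (operand 1) equals the vector width; a step of,
  // say, 8 in a 4-wide loop is rejected by the checks below.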
+  auto *IV = ActiveLaneMask->getOperand(0);
+  auto *IVExpr = SE->getSCEV(IV);
+  auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+  if (!AddExpr) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+    return false;
+  }
+  // Check that this AddRec is associated with this loop.
+  if (AddExpr->getLoop() != L) {
+    LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+    return false;
+  }
+  auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+  if (!Step) {
+    LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+               AddExpr->getOperand(1)->dump());
+    return false;
+  }
+  auto StepValue = Step->getValue()->getSExtValue();
+  if (VectorWidth == StepValue)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+                       "vector width " << VectorWidth << "\n");
+
+  return false;
+}
+
+// Materialize NumElements in the preheader block.
+static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
+  // First, check whether %BTC already exists in the preheader:
+  //
+  // preheader:
+  //    %BTC = add i32 %N, -1
+  //    ..
+  // vector.body:
+  //
+  // If it does, we don't need to emit %NumElems = %BTC + 1, but can just
+  // return %N.
+  for (auto &I : *Preheader) {
+    if (I.getOpcode() != Instruction::Add || &I != BTC)
+      continue;
+    ConstantInt *MinusOne = nullptr;
+    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+      continue;
+    if (MinusOne->getSExtValue() == -1) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+      return I.getOperand(0);
+    }
+  }
+
+  // But we do need to materialise BTC if it is not already there,
+  // e.g. if it is a constant.
+  IRBuilder<> Builder(Preheader->getTerminator());
+  Value *NumElements = Builder.CreateAdd(BTC,
+      ConstantInt::get(BTC->getType(), 1), "num.elements");
+  LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
+  return NumElements;
+}
+
+void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
+    Value *TripCount, FixedVectorType *VecTy) {
+  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
+  unsigned VectorWidth = VecTy->getNumElements();
+
+  // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
+  // is one less than the trip count. So we need to find or create
+  // %num.elements = %BTC + 1 in the preheader.
+  Value *BTC = ActiveLaneMask->getOperand(1);
+  Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
 
   // Insert a phi to count the number of elements processed by the loop.
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
   Processed->addIncoming(NumElements, L->getLoopPreheader());
 
-  // Insert the intrinsic to represent the effect of tail predication.
-  Builder.SetInsertPoint(cast<Instruction>(Predicate));
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
+ Builder.SetInsertPoint(ActiveLaneMask); ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { + switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; @@ -488,9 +557,8 @@ void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate, // purposes, but takes a v4i1 instead of a v2i1. } Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); - Value *TailPredicate = Builder.CreateCall(VCTP, Processed); - Predicate->replaceAllUsesWith(TailPredicate); - NewPredicates[Predicate] = cast<Instruction>(TailPredicate); + Value *VCTPCall = Builder.CreateCall(VCTP, Processed); + ActiveLaneMask->replaceAllUsesWith(VCTPCall); // Add the incoming value to the new phi. // TODO: This add likely already exists in the loop. @@ -498,47 +566,45 @@ void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate, Processed->addIncoming(Remaining, L->getLoopLatch()); LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: " << *Processed << "\n" - << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n"); + << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } bool MVETailPredication::TryConvert(Value *TripCount) { if (!IsPredicatedVectorLoop()) { - LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop"); + LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n"); return false; } LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - - // Walk through the masked intrinsics and try to find whether the predicate - // operand is generated from an induction variable. SetVector<Instruction*> Predicates; - DenseMap<Instruction*, Instruction*> NewPredicates; + // Walk through the masked intrinsics and try to find whether the predicate + // operand is generated by intrinsic @llvm.get.active.lane.mask(). for (auto *I : MaskedInsts) { - Intrinsic::ID ID = I->getIntrinsicID(); - unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3; + unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3; auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp)); if (!Predicate || Predicates.count(Predicate)) continue; - VectorType *VecTy = getVectorType(I); - Value *NumElements = ComputeElements(TripCount, VecTy); - if (!NumElements) - continue; - - if (!isTailPredicate(Predicate, NumElements)) { - LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n"); + auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); + if (!ActiveLaneMask || + ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; - } - LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n"); Predicates.insert(Predicate); + LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " + << *ActiveLaneMask << "\n"); - InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements); + auto *VecTy = getVectorType(I); + if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { + LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy); } - // Now clean up. 
- Cleanup(NewPredicates, Predicates, L); + Cleanup(Predicates, L); return true; } diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp index a5df46c94f42..dc769ae526bc 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp @@ -22,9 +22,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" #include <cassert> #include <new> @@ -34,83 +34,220 @@ using namespace llvm; #define DEBUG_TYPE "arm-mve-vpt" namespace { - class MVEVPTBlock : public MachineFunctionPass { - public: - static char ID; +class MVEVPTBlock : public MachineFunctionPass { +public: + static char ID; + const Thumb2InstrInfo *TII; + const TargetRegisterInfo *TRI; - MVEVPTBlock() : MachineFunctionPass(ID) {} + MVEVPTBlock() : MachineFunctionPass(ID) {} - bool runOnMachineFunction(MachineFunction &Fn) override; + bool runOnMachineFunction(MachineFunction &Fn) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<ReachingDefAnalysis>(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs).set( - MachineFunctionProperties::Property::TracksLiveness); - } - - StringRef getPassName() const override { - return "MVE VPT block insertion pass"; - } + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } - private: - bool InsertVPTBlocks(MachineBasicBlock &MBB); + StringRef getPassName() const override { + return "MVE VPT block insertion pass"; + } - const Thumb2InstrInfo *TII = nullptr; - ReachingDefAnalysis *RDA = nullptr; - }; +private: + bool InsertVPTBlocks(MachineBasicBlock &MBB); +}; - char MVEVPTBlock::ID = 0; +char MVEVPTBlock::ID = 0; } // end anonymous namespace INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false) -static MachineInstr *findVCMPToFoldIntoVPST(MachineInstr *MI, - ReachingDefAnalysis *RDA, +static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI, + const TargetRegisterInfo *TRI, unsigned &NewOpcode) { - // First, search backwards to the instruction that defines VPR - auto *Def = RDA->getReachingMIDef(MI, ARM::VPR); - if (!Def) - return nullptr; + // Search backwards to the instruction that defines VPR. This may or not + // be a VCMP, we check that after this loop. If we find another instruction + // that reads cpsr, we return nullptr. + MachineBasicBlock::iterator CmpMI = MI; + while (CmpMI != MI->getParent()->begin()) { + --CmpMI; + if (CmpMI->modifiesRegister(ARM::VPR, TRI)) + break; + if (CmpMI->readsRegister(ARM::VPR, TRI)) + break; + } - // Now check that Def is a VCMP - if (!(NewOpcode = VCMPOpcodeToVPT(Def->getOpcode()))) + if (CmpMI == MI) + return nullptr; + NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode()); + if (NewOpcode == 0) return nullptr; - // Check that Def's operands are not defined between the VCMP and MI, i.e. - // check that they have the same reaching def. 
- if (!RDA->hasSameReachingDef(Def, MI, Def->getOperand(1).getReg()) || - !RDA->hasSameReachingDef(Def, MI, Def->getOperand(2).getReg())) + // Search forward from CmpMI to MI, checking if either register was def'd + if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI), + MI, TRI)) + return nullptr; + if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI), + MI, TRI)) return nullptr; + return &*CmpMI; +} + +// Advances Iter past a block of predicated instructions. +// Returns true if it successfully skipped the whole block of predicated +// instructions. Returns false when it stopped early (due to MaxSteps), or if +// Iter didn't point to a predicated instruction. +static bool StepOverPredicatedInstrs(MachineBasicBlock::instr_iterator &Iter, + MachineBasicBlock::instr_iterator EndIter, + unsigned MaxSteps, + unsigned &NumInstrsSteppedOver) { + ARMVCC::VPTCodes NextPred = ARMVCC::None; + Register PredReg; + NumInstrsSteppedOver = 0; + + while (Iter != EndIter) { + NextPred = getVPTInstrPredicate(*Iter, PredReg); + assert(NextPred != ARMVCC::Else && + "VPT block pass does not expect Else preds"); + if (NextPred == ARMVCC::None || MaxSteps == 0) + break; + --MaxSteps; + ++Iter; + ++NumInstrsSteppedOver; + }; + + return NumInstrsSteppedOver != 0 && + (NextPred == ARMVCC::None || Iter == EndIter); +} + +// Returns true if at least one instruction in the range [Iter, End) defines +// or kills VPR. +static bool IsVPRDefinedOrKilledByBlock(MachineBasicBlock::iterator Iter, + MachineBasicBlock::iterator End) { + for (; Iter != End; ++Iter) + if (Iter->definesRegister(ARM::VPR) || Iter->killsRegister(ARM::VPR)) + return true; + return false; +} + +// Creates a T, TT, TTT or TTTT BlockMask depending on BlockSize. +static ARM::PredBlockMask GetInitialBlockMask(unsigned BlockSize) { + switch (BlockSize) { + case 1: + return ARM::PredBlockMask::T; + case 2: + return ARM::PredBlockMask::TT; + case 3: + return ARM::PredBlockMask::TTT; + case 4: + return ARM::PredBlockMask::TTTT; + default: + llvm_unreachable("Invalid BlockSize!"); + } +} + +// Given an iterator (Iter) that points at an instruction with a "Then" +// predicate, tries to create the largest block of continuous predicated +// instructions possible, and returns the VPT Block Mask of that block. +// +// This will try to perform some minor optimization in order to maximize the +// size of the block. +static ARM::PredBlockMask +CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter, + MachineBasicBlock::instr_iterator EndIter, + SmallVectorImpl<MachineInstr *> &DeadInstructions) { + MachineBasicBlock::instr_iterator BlockBeg = Iter; + (void)BlockBeg; + assert(getVPTInstrPredicate(*Iter) == ARMVCC::Then && + "Expected a Predicated Instruction"); + + LLVM_DEBUG(dbgs() << "VPT block created for: "; Iter->dump()); + + unsigned BlockSize; + StepOverPredicatedInstrs(Iter, EndIter, 4, BlockSize); + + LLVM_DEBUG(for (MachineBasicBlock::instr_iterator AddedInstIter = + std::next(BlockBeg); + AddedInstIter != Iter; ++AddedInstIter) { + dbgs() << " adding: "; + AddedInstIter->dump(); + }); + + // Generate the initial BlockMask + ARM::PredBlockMask BlockMask = GetInitialBlockMask(BlockSize); + + // Remove VPNOTs while there's still room in the block, so we can make the + // largest block possible. 
+  ARMVCC::VPTCodes CurrentPredicate = ARMVCC::Else;
+  while (BlockSize < 4 && Iter != EndIter &&
+         Iter->getOpcode() == ARM::MVE_VPNOT) {
+
+    // Try to skip all of the predicated instructions after the VPNOT, stopping
+    // after (4 - BlockSize). If we can't skip them all, stop.
+    unsigned ElseInstCnt = 0;
+    MachineBasicBlock::instr_iterator VPNOTBlockEndIter = std::next(Iter);
+    if (!StepOverPredicatedInstrs(VPNOTBlockEndIter, EndIter, (4 - BlockSize),
+                                  ElseInstCnt))
+      break;
+
+    // Check if this VPNOT can be removed or not: it can only be removed if at
+    // least one of the predicated instructions that follow it kills or sets
+    // VPR.
+    if (!IsVPRDefinedOrKilledByBlock(Iter, VPNOTBlockEndIter))
+      break;
+
+    LLVM_DEBUG(dbgs() << "  removing VPNOT: "; Iter->dump(););
+
+    // Record the new size of the block
+    BlockSize += ElseInstCnt;
+    assert(BlockSize <= 4 && "Block is too large!");
+
+    // Record the VPNOT to remove it later.
+    DeadInstructions.push_back(&*Iter);
+    ++Iter;
+
+    // Replace the predicates of the instructions we're adding.
+    // Note that we are using "Iter" to iterate over the block so we can update
+    // it at the same time.
+    for (; Iter != VPNOTBlockEndIter; ++Iter) {
+      // Find the operand that holds the predicate
+      int OpIdx = findFirstVPTPredOperandIdx(*Iter);
+      assert(OpIdx != -1);
+
+      // Change the predicate and update the mask
+      Iter->getOperand(OpIdx).setImm(CurrentPredicate);
+      BlockMask = expandPredBlockMask(BlockMask, CurrentPredicate);
+
+      LLVM_DEBUG(dbgs() << "  adding : "; Iter->dump());
+    }
 
-  return Def;
+    CurrentPredicate =
+        (CurrentPredicate == ARMVCC::Then ? ARMVCC::Else : ARMVCC::Then);
+  }
+  return BlockMask;
 }
 
 bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
   bool Modified = false;
   MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
   MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
-  SmallSet<MachineInstr *, 4> RemovedVCMPs;
+
+  SmallVector<MachineInstr *, 4> DeadInstructions;
 
   while (MBIter != EndIter) {
     MachineInstr *MI = &*MBIter;
-    unsigned PredReg = 0;
-    DebugLoc dl = MI->getDebugLoc();
+    Register PredReg;
+    DebugLoc DL = MI->getDebugLoc();
     ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
 
     // The idea of the predicate is that None, Then and Else are for use when
     // handling assembly language: they correspond to the three possible
     // suffixes "", "t" and "e" on the mnemonic. So when instructions are read
-    // from assembly source or disassembled from object code, you expect to see
-    // a mixture whenever there's a long VPT block. But in code generation, we
-    // hope we'll never generate an Else as input to this pass.
+    // from assembly source or disassembled from object code, you expect to
+    // see a mixture whenever there's a long VPT block. But in code
+    // generation, we hope we'll never generate an Else as input to this pass.
     assert(Pred != ARMVCC::Else &&
            "VPT block pass does not expect Else preds");
 
     if (Pred == ARMVCC::None) {
@@ -118,46 +255,25 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
       continue;
     }
 
-    LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
-    int VPTInstCnt = 1;
-    ARMVCC::VPTCodes NextPred;
-
-    // Look at subsequent instructions, checking if they can be in the same VPT
-    // block.
- ++MBIter; - while (MBIter != EndIter && VPTInstCnt < 4) { - NextPred = getVPTInstrPredicate(*MBIter, PredReg); - assert(NextPred != ARMVCC::Else && - "VPT block pass does not expect Else preds"); - if (NextPred != Pred) - break; - LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump()); - ++VPTInstCnt; - ++MBIter; - }; - - unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt); + ARM::PredBlockMask BlockMask = + CreateVPTBlock(MBIter, EndIter, DeadInstructions); - // Search back for a VCMP that can be folded to create a VPT, or else create - // a VPST directly + // Search back for a VCMP that can be folded to create a VPT, or else + // create a VPST directly MachineInstrBuilder MIBuilder; unsigned NewOpcode; - MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, RDA, NewOpcode); - if (VCMP) { + LLVM_DEBUG(dbgs() << " final block mask: " << (unsigned)BlockMask << "\n"); + if (MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode)) { LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump()); - MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode)); - MIBuilder.addImm(BlockMask); + MIBuilder = BuildMI(Block, MI, DL, TII->get(NewOpcode)); + MIBuilder.addImm((uint64_t)BlockMask); MIBuilder.add(VCMP->getOperand(1)); MIBuilder.add(VCMP->getOperand(2)); MIBuilder.add(VCMP->getOperand(3)); - // We delay removing the actual VCMP instruction by saving it to a list - // and deleting all instructions in this list in one go after we have - // created the VPT blocks. We do this in order not to invalidate the - // ReachingDefAnalysis that is queried by 'findVCMPToFoldIntoVPST'. - RemovedVCMPs.insert(VCMP); + VCMP->eraseFromParent(); } else { - MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST)); - MIBuilder.addImm(BlockMask); + MIBuilder = BuildMI(Block, MI, DL, TII->get(ARM::MVE_VPST)); + MIBuilder.addImm((uint64_t)BlockMask); } finalizeBundle( @@ -166,16 +282,18 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) { Modified = true; } - for (auto *I : RemovedVCMPs) - I->eraseFromParent(); + // Erase all dead instructions + for (MachineInstr *DeadMI : DeadInstructions) { + if (DeadMI->isInsideBundle()) + DeadMI->eraseFromBundle(); + else + DeadMI->eraseFromParent(); + } return Modified; } bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { - if (skipFunction(Fn.getFunction())) - return false; - const ARMSubtarget &STI = static_cast<const ARMSubtarget &>(Fn.getSubtarget()); @@ -183,7 +301,7 @@ bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) { return false; TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); - RDA = &getAnalysis<ReachingDefAnalysis>(); + TRI = STI.getRegisterInfo(); LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n" << "********** Function: " << Fn.getName() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp new file mode 100644 index 000000000000..382ddd4572c7 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp @@ -0,0 +1,464 @@ +//===-- MVEVPTOptimisationsPass.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass does a few optimisations related to MVE VPT blocks before
+/// register allocation is performed. The goal is to maximize the sizes of the
+/// blocks that will be created by the MVE VPT Block Insertion pass (which runs
+/// after register allocation). The first optimisation done by this pass is the
+/// replacement of "opposite" VCMPs with VPNOTs, so the Block Insertion pass
+/// can delete them later to create larger VPT blocks.
+/// The second optimisation replaces re-uses of old VCCR values with VPNOTs when
+/// inside a block of predicated instructions. This is done to avoid
+/// spills/reloads of VPR in the middle of a block, which prevents the Block
+/// Insertion pass from creating large blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-mve-vpt-opts"
+
+namespace {
+class MVEVPTOptimisations : public MachineFunctionPass {
+public:
+  static char ID;
+  const Thumb2InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+
+  MVEVPTOptimisations() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override {
+    return "ARM MVE VPT Optimisation Pass";
+  }
+
+private:
+  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
+                                            MachineInstr &Instr,
+                                            MachineOperand &User,
+                                            Register Target);
+  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
+  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+};
+
+char MVEVPTOptimisations::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTOptimisations, DEBUG_TYPE,
+                "ARM MVE VPT Optimisations pass", false, false)
+
+// Returns true if Opcode is any VCMP Opcode.
+static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }
+
+// Returns true if a VCMP with this Opcode can have its operands swapped.
+// There are two kinds of VCMP that can't have their operands swapped: float
+// VCMPs, and VCMPr instructions (since the r is always on the right).
+static bool CanHaveSwappedOperands(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    return true;
+  case ARM::MVE_VCMPf32:
+  case ARM::MVE_VCMPf16:
+  case ARM::MVE_VCMPf32r:
+  case ARM::MVE_VCMPf16r:
+  case ARM::MVE_VCMPi8r:
+  case ARM::MVE_VCMPi16r:
+  case ARM::MVE_VCMPi32r:
+  case ARM::MVE_VCMPu8r:
+  case ARM::MVE_VCMPu16r:
+  case ARM::MVE_VCMPu32r:
+  case ARM::MVE_VCMPs8r:
+  case ARM::MVE_VCMPs16r:
+  case ARM::MVE_VCMPs32r:
+    return false;
+  }
+}
+
+// Returns the CondCode of a VCMP Instruction.
+static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
+  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
+  return ARMCC::CondCodes(Instr.getOperand(3).getImm());
+}
+
+// Returns true if Cond is equivalent to a VPNOT instruction on the result of
+// Prev. Cond and Prev must be VCMPs.
+static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
+  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));
+
+  // Opcodes must match.
+  if (Cond.getOpcode() != Prev.getOpcode())
+    return false;
+
+  MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
+  MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);
+
+  // If the VCMP has the opposite condition with the same operands, we can
+  // replace it with a VPNOT
+  ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
+  ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
+  if (ExpectedCode == GetCondCode(Prev))
+    if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
+      return true;
+  // Check again with operands swapped if possible
+  if (!CanHaveSwappedOperands(Cond.getOpcode()))
+    return false;
+  ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
+  return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
+         CondOP2.isIdenticalTo(PrevOP1);
+}
+
+// Returns true if Instr writes to VCCR.
+static bool IsWritingToVCCR(MachineInstr &Instr) {
+  if (Instr.getNumOperands() == 0)
+    return false;
+  MachineOperand &Dst = Instr.getOperand(0);
+  if (!Dst.isReg())
+    return false;
+  Register DstReg = Dst.getReg();
+  if (!DstReg.isVirtual())
+    return false;
+  MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
+  const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
+  return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
+}
+
+// Transforms
+//    <Instr that uses %A ('User' Operand)>
+// Into
+//    %K = VPNOT %Target
+//    <Instr that uses %K ('User' Operand)>
+// And returns the newly inserted VPNOT.
+// This optimisation is done in the hopes of preventing spills/reloads of VPR by
+// reducing the number of VCCR values with overlapping lifetimes.
+MachineInstr &MVEVPTOptimisations::ReplaceRegisterUseWithVPNOT(
+    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
+    Register Target) {
+  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
+
+  MachineInstrBuilder MIBuilder =
+      BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
+          .addDef(NewResult)
+          .addReg(Target);
+  addUnpredicatedMveVpredNOp(MIBuilder);
+
+  // Make the user use NewResult instead, and clear its kill flag.
+  User.setReg(NewResult);
+  User.setIsKill(false);
+
+  LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
+             MIBuilder.getInstr()->dump());
+
+  return *MIBuilder.getInstr();
+}
+
+// Moves a VPNOT before its first user if an instruction that uses Reg is found
+// in between the VPNOT and its user.
+// Returns true if there is at least one user of the VPNOT in the block.
+static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator Iter,
+                                     Register Reg) {
+  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
+  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
+         "The VPNOT cannot be predicated");
+
+  MachineInstr &VPNOT = *Iter;
+  Register VPNOTResult = VPNOT.getOperand(0).getReg();
+  Register VPNOTOperand = VPNOT.getOperand(1).getReg();
+
+  // Whether the VPNOT will need to be moved, and whether we found a user of the
+  // VPNOT.
+  bool MustMove = false, HasUser = false;
+  MachineOperand *VPNOTOperandKiller = nullptr;
+  for (; Iter != MBB.end(); ++Iter) {
+    if (MachineOperand *MO =
+            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
+      // If we find the use that kills the VPNOT's operand, save it.
+ VPNOTOperandKiller = MO; + } + + if (Iter->findRegisterUseOperandIdx(Reg) != -1) { + MustMove = true; + continue; + } + + if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1) + continue; + + HasUser = true; + if (!MustMove) + break; + + // Move the VPNOT right before Iter + LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: "; + Iter->dump()); + MBB.splice(Iter, &MBB, VPNOT.getIterator()); + // If we move the instr, and its operand was killed earlier, remove the kill + // flag. + if (VPNOTOperandKiller) + VPNOTOperandKiller->setIsKill(false); + + break; + } + return HasUser; +} + +// This optimisation attempts to reduce the number of overlapping lifetimes of +// VCCR values by replacing uses of old VCCR values with VPNOTs. For example, +// this replaces +// %A:vccr = (something) +// %B:vccr = VPNOT %A +// %Foo = (some op that uses %B) +// %Bar = (some op that uses %A) +// With +// %A:vccr = (something) +// %B:vccr = VPNOT %A +// %Foo = (some op that uses %B) +// %TMP2:vccr = VPNOT %B +// %Bar = (some op that uses %A) +bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) { + MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end(); + SmallVector<MachineInstr *, 4> DeadInstructions; + bool Modified = false; + + while (Iter != End) { + Register VCCRValue, OppositeVCCRValue; + // The first loop looks for 2 unpredicated instructions: + // %A:vccr = (instr) ; A is stored in VCCRValue + // %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue + for (; Iter != End; ++Iter) { + // We're only interested in unpredicated instructions that write to VCCR. + if (!IsWritingToVCCR(*Iter) || + getVPTInstrPredicate(*Iter) != ARMVCC::None) + continue; + Register Dst = Iter->getOperand(0).getReg(); + + // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've + // found what we were looking for. + if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT && + Iter->findRegisterUseOperandIdx(VCCRValue) != -1) { + // Move the VPNOT closer to its first user if needed, and ignore if it + // has no users. + if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue)) + continue; + + OppositeVCCRValue = Dst; + ++Iter; + break; + } + + // Else, just set VCCRValue. + VCCRValue = Dst; + } + + // If the first inner loop didn't find anything, stop here. + if (Iter == End) + break; + + assert(VCCRValue && OppositeVCCRValue && + "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop " + "stopped before the end of the block!"); + assert(VCCRValue != OppositeVCCRValue && + "VCCRValue should not be equal to OppositeVCCRValue!"); + + // LastVPNOTResult always contains the same value as OppositeVCCRValue. + Register LastVPNOTResult = OppositeVCCRValue; + + // This second loop tries to optimize the remaining instructions. + for (; Iter != End; ++Iter) { + bool IsInteresting = false; + + if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) { + IsInteresting = true; + + // - If the instruction is a VPNOT, it can be removed, and we can just + // replace its uses with LastVPNOTResult. + // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue. 
+      if (Iter->getOpcode() == ARM::MVE_VPNOT) {
+        Register Result = Iter->getOperand(0).getReg();
+
+        MRI->replaceRegWith(Result, LastVPNOTResult);
+        DeadInstructions.push_back(&*Iter);
+        Modified = true;
+
+        LLVM_DEBUG(dbgs()
+                   << "Replacing all uses of '" << printReg(Result)
+                   << "' with '" << printReg(LastVPNOTResult) << "'\n");
+      } else {
+        MachineInstr &VPNOT =
+            ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
+        Modified = true;
+
+        LastVPNOTResult = VPNOT.getOperand(0).getReg();
+        std::swap(VCCRValue, OppositeVCCRValue);
+
+        LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
+                          << "' with '" << printReg(LastVPNOTResult)
+                          << "' in instr: " << *Iter);
+      }
+    } else {
+      // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
+      // instead as they contain the same value.
+      if (MachineOperand *MO =
+              Iter->findRegisterUseOperand(OppositeVCCRValue)) {
+        IsInteresting = true;
+
+        // This is pointless if LastVPNOTResult == OppositeVCCRValue.
+        if (LastVPNOTResult != OppositeVCCRValue) {
+          LLVM_DEBUG(dbgs() << "Replacing usage of '"
+                            << printReg(OppositeVCCRValue) << "' with '"
+                            << printReg(LastVPNOTResult) << "' for instr: ";
+                     Iter->dump());
+          MO->setReg(LastVPNOTResult);
+          Modified = true;
+        }
+
+        MO->setIsKill(false);
+      }
+
+      // If this is an unpredicated VPNOT on
+      // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
+      if (Iter->getOpcode() == ARM::MVE_VPNOT &&
+          getVPTInstrPredicate(*Iter) == ARMVCC::None) {
+        Register VPNOTOperand = Iter->getOperand(1).getReg();
+        if (VPNOTOperand == LastVPNOTResult ||
+            VPNOTOperand == OppositeVCCRValue) {
+          IsInteresting = true;
+
+          std::swap(VCCRValue, OppositeVCCRValue);
+          LastVPNOTResult = Iter->getOperand(0).getReg();
+        }
+      }
+    }
+
+    // If this instruction was not interesting, and it writes to VCCR, stop.
+    if (!IsInteresting && IsWritingToVCCR(*Iter))
+      break;
+    }
+  }
+
+  for (MachineInstr *DeadInstruction : DeadInstructions)
+    DeadInstruction->removeFromParent();
+
+  return Modified;
+}
+
+// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
+bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
+  SmallVector<MachineInstr *, 4> DeadInstructions;
+
+  // The last VCMP that we have seen and that couldn't be replaced.
+  // This is reset when an instruction that writes to VCCR/VPR is found, or when
+  // a VCMP is replaced with a VPNOT.
+  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
+  // current VCMP is the opposite of PrevVCMP.
+  MachineInstr *PrevVCMP = nullptr;
+  // If we find an instruction that kills the result of PrevVCMP, we save the
+  // operand here to remove the kill flag in case we need to use PrevVCMP's
+  // result.
+  MachineOperand *PrevVCMPResultKiller = nullptr;
+
+  for (MachineInstr &Instr : MBB.instrs()) {
+    if (PrevVCMP) {
+      if (MachineOperand *MO = Instr.findRegisterUseOperand(
+              PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
+        // If we come across the instr that kills PrevVCMP's result, record it
+        // so we can remove the kill flag later if we need to.
+        PrevVCMPResultKiller = MO;
+      }
+    }
+
+    // Ignore predicated instructions.
+    if (getVPTInstrPredicate(Instr) != ARMVCC::None)
+      continue;
+
+    // Only look at VCMPs
+    if (!IsVCMP(Instr.getOpcode())) {
+      // If the instruction writes to VCCR, forget the previous VCMP.
+ if (IsWritingToVCCR(Instr)) + PrevVCMP = nullptr; + continue; + } + + if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) { + PrevVCMP = &Instr; + continue; + } + + // The register containing the result of the VCMP that we're going to + // replace. + Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg(); + + // Build a VPNOT to replace the VCMP, reusing its operands. + MachineInstrBuilder MIBuilder = + BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT)) + .add(Instr.getOperand(0)) + .addReg(PrevVCMPResultReg); + addUnpredicatedMveVpredNOp(MIBuilder); + LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): "; + MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: "; + Instr.dump()); + + // If we found an instruction that uses, and kills PrevVCMP's result, + // remove the kill flag. + if (PrevVCMPResultKiller) + PrevVCMPResultKiller->setIsKill(false); + + // Finally, mark the old VCMP for removal and reset + // PrevVCMP/PrevVCMPResultKiller. + DeadInstructions.push_back(&Instr); + PrevVCMP = nullptr; + PrevVCMPResultKiller = nullptr; + } + + for (MachineInstr *DeadInstruction : DeadInstructions) + DeadInstruction->removeFromParent(); + + return !DeadInstructions.empty(); +} + +bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) { + const ARMSubtarget &STI = + static_cast<const ARMSubtarget &>(Fn.getSubtarget()); + + if (!STI.isThumb2() || !STI.hasMVEIntegerOps()) + return false; + + TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo()); + MRI = &Fn.getRegInfo(); + + LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n" + << "********** Function: " << Fn.getName() << '\n'); + + bool Modified = false; + for (MachineBasicBlock &MBB : Fn) { + Modified |= ReplaceVCMPsByVPNOTs(MBB); + Modified |= ReduceOldVCCRValueUses(MBB); + } + + LLVM_DEBUG(dbgs() << "**************************************\n"); + return Modified; +} + +/// createMVEVPTOptimisationsPass +FunctionPass *llvm::createMVEVPTOptimisationsPass() { + return new MVEVPTOptimisations(); +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 956d474f1d79..d568e9afe432 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -88,8 +88,10 @@ emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB, 0, MIFlags); } BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP) - .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill) - .add(predOps(ARMCC::AL)); + .addReg(ARM::SP) + .addReg(ScratchReg, RegState::Kill) + .add(predOps(ARMCC::AL)) + .setMIFlags(MIFlags); return; } // FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate @@ -127,7 +129,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - Amount = alignTo(Amount, getStackAlignment()); + Amount = alignTo(Amount, getStackAlign()); // Replace the pseudo instruction with a new instruction... 
unsigned Opc = Old.getOpcode(); @@ -180,9 +182,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, if (ArgRegsSaveSize) { emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize, ARM::NoRegister, MachineInstr::FrameSetup); - CFAOffset -= ArgRegsSaveSize; - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); + CFAOffset += ArgRegsSaveSize; + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -193,9 +195,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -(NumBytes - ArgRegsSaveSize), ARM::NoRegister, MachineInstr::FrameSetup); - CFAOffset -= NumBytes - ArgRegsSaveSize; + CFAOffset += NumBytes - ArgRegsSaveSize; unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -257,9 +259,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, } if (adjustedGPRCS1Size) { - CFAOffset -= adjustedGPRCS1Size; - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); + CFAOffset += adjustedGPRCS1Size; + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -305,8 +307,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup) .add(predOps(ARMCC::AL)); if(FramePtrOffsetInBlock) { - CFAOffset += FramePtrOffsetInBlock; - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( + CFAOffset -= FramePtrOffsetInBlock; + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) @@ -384,9 +386,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes, ScratchRegister, MachineInstr::FrameSetup); if (!HasFP) { - CFAOffset -= NumBytes; + CFAOffset += NumBytes; unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -402,7 +404,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, AFI->setDPRCalleeSavedAreaSize(DPRCSSize); if (RegInfo->needsStackRealignment(MF)) { - const unsigned NrBitsToZero = countTrailingZeros(MFI.getMaxAlignment()); + const unsigned NrBitsToZero = Log2(MFI.getMaxAlign()); // Emit the following sequence, using R4 as a temporary, since we cannot use // SP as a source or destination register for the shifts: // mov r4, sp @@ -804,11 +806,9 @@ static const unsigned *findNextOrderedReg(const unsigned *CurrentReg, return CurrentReg; } -bool Thumb1FrameLowering:: -spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { +bool 
Thumb1FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -927,11 +927,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } -bool Thumb1FrameLowering:: -restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { +bool Thumb1FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -1049,6 +1047,10 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (!STI.hasV5TOps()) continue; + // CMSE entry functions must return via BXNS, see emitEpilogue. + if (AFI->isCmseNSEntryFunction()) + continue; + // Pop LR into PC. Reg = ARM::PC; (*MIB).setDesc(TII.get(ARM::tPOP_RET)); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h index 61af48712b6c..a4b2a085ea38 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -27,12 +27,13 @@ public: bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const override; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const override; + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index b08b71a4952d..79afa378cb62 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -76,7 +76,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB, void Thumb1InstrInfo:: storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, + Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { assert((RC == &ARM::tGPRRegClass || @@ -92,7 +92,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); BuildMI(MBB, I, DL, get(ARM::tSTRspi)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI) @@ -104,7 +104,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, void Thumb1InstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, + Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { assert( @@ -121,7 +121,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, 
MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg) .addFrameIndex(FI) .addImm(0) diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h index 530289fe8c5d..017b7222337c 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb1InstrInfo.h @@ -42,13 +42,13 @@ public: bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, + Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, + Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index 786fc78d0233..5cdaa7f02201 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -183,7 +183,7 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI, ++I; if (I != E) { - unsigned NPredReg = 0; + Register NPredReg; ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg); if (NCC == CC || NCC == OCC) return true; @@ -199,7 +199,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) { while (MBBI != E) { MachineInstr *MI = &*MBBI; DebugLoc dl = MI->getDebugLoc(); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg); if (CC == ARMCC::AL) { ++MBBI; @@ -239,7 +239,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) { MachineInstr *NMI = &*MBBI; MI = NMI; - unsigned NPredReg = 0; + Register NPredReg; ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg); if (NCC == CC || NCC == OCC) { Mask |= ((NCC ^ CC) & 1) << Pos; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index e06bb9546c03..48c6b47f2154 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -66,7 +66,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail, // If the first instruction of Tail is predicated, we may have to update // the IT instruction. 
- unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg); MachineBasicBlock::iterator MBBI = Tail; if (CC != ARMCC::AL) @@ -114,7 +114,7 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, return false; } - unsigned PredReg = 0; + Register PredReg; return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL; } @@ -133,7 +133,7 @@ void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB, void Thumb2InstrInfo:: storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, + Register SrcReg, bool isKill, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL; @@ -143,7 +143,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); if (ARM::GPRRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(ARM::t2STRi12)) @@ -176,14 +176,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, void Thumb2InstrInfo:: loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, + Register DestReg, int FI, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); @@ -229,9 +229,9 @@ void Thumb2InstrInfo::expandLoadStackGuard( void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, - unsigned BaseReg, int NumBytes, - ARMCC::CondCodes Pred, unsigned PredReg, + const DebugLoc &dl, Register DestReg, + Register BaseReg, int NumBytes, + ARMCC::CondCodes Pred, Register PredReg, const ARMBaseInstrInfo &TII, unsigned MIFlags) { if (NumBytes == 0 && DestReg != BaseReg) { @@ -471,7 +471,7 @@ immediateOffsetOpcode(unsigned opcode) } bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + Register FrameReg, int &Offset, const ARMBaseInstrInfo &TII, const TargetRegisterInfo *TRI) { unsigned Opcode = MI.getOpcode(); @@ -491,7 +491,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, if (IsSP || Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) { Offset += MI.getOperand(FrameRegIdx+1).getImm(); - unsigned PredReg; + Register PredReg; if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL && !MI.definesRegister(ARM::CPSR)) { // Turn it into a move. @@ -634,7 +634,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, assert((Offset & OffsetMask) == 0 && "Can't encode this offset!"); (void)OffsetMask; // squash unused-variable warning at -NDEBUG } else if (AddrMode == ARMII::AddrModeT2_i8s4) { - Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4; + Offset += MI.getOperand(FrameRegIdx + 1).getImm(); NumBits = 8 + 2; // MCInst operand expects already scaled value. 
Scale = 1; @@ -706,7 +706,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, } ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI, - unsigned &PredReg) { + Register &PredReg) { unsigned Opc = MI.getOpcode(); if (Opc == ARM::tBcc || Opc == ARM::t2Bcc) return ARMCC::AL; @@ -727,7 +727,7 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) { } ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, - unsigned &PredReg) { + Register &PredReg) { int PIdx = findFirstVPTPredOperandIdx(MI); if (PIdx == -1) { PredReg = 0; @@ -737,3 +737,33 @@ ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI, PredReg = MI.getOperand(PIdx+1).getReg(); return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm(); } + +void llvm::recomputeVPTBlockMask(MachineInstr &Instr) { + assert(isVPTOpcode(Instr.getOpcode()) && "Not a VPST or VPT Instruction!"); + + MachineOperand &MaskOp = Instr.getOperand(0); + assert(MaskOp.isImm() && "Operand 0 is not the block mask of the VPT/VPST?!"); + + MachineBasicBlock::iterator Iter = ++Instr.getIterator(), + End = Instr.getParent()->end(); + + // Verify that the instruction after the VPT/VPST is predicated (it should + // be), and skip it. + assert( + getVPTInstrPredicate(*Iter) == ARMVCC::Then && + "VPT/VPST should be followed by an instruction with a 'then' predicate!"); + ++Iter; + + // Iterate over the predicated instructions, updating the BlockMask as we go. + ARM::PredBlockMask BlockMask = ARM::PredBlockMask::T; + while (Iter != End) { + ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*Iter); + if (Pred == ARMVCC::None) + break; + BlockMask = expandPredBlockMask(BlockMask, Pred); + ++Iter; + } + + // Rewrite the BlockMask. + MaskOp.setImm((int64_t)(BlockMask)); +} diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h index 7d8dff14e1e7..ec3763632239 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2InstrInfo.h @@ -44,13 +44,13 @@ public: void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FrameIndex, + Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIndex, + Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -67,13 +67,24 @@ private: /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical /// to llvm::getInstrPredicate except it returns AL for conditional branch /// instructions which are "predicated", but are not in IT blocks. -ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg); +ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, Register &PredReg); // getVPTInstrPredicate: VPT analogue of that, plus a helper function // corresponding to MachineInstr::findFirstPredOperandIdx. int findFirstVPTPredOperandIdx(const MachineInstr &MI); ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI, - unsigned &PredReg); + Register &PredReg); +inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) { + Register PredReg; + return getVPTInstrPredicate(MI, PredReg); } +// Recomputes the Block Mask of Instr, a VPT or VPST instruction. 
+// This rebuilds the block mask of the instruction depending on the predicates +// of the instructions following it. This should only be used after the +// MVEVPTBlockInsertion pass has run, and should be used whenever a predicated +// instruction is added to/removed from the block. +void recomputeVPTBlockMask(MachineInstr &Instr); +} // namespace llvm + #endif diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index c5a62aa33990..ae661594bdc9 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -457,7 +457,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, return false; if (!MI->hasOneMemOperand() || - (*MI->memoperands_begin())->getAlignment() < 4) + (*MI->memoperands_begin())->getAlign() < Align(4)) return false; // We're creating a completely different type of load/store - LDM from LDR. @@ -516,13 +516,23 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, isLdStMul = true; break; } - case ARM::t2STMIA: - // If the base register is killed, we don't care what its value is after the - // instruction, so we can use an updating STMIA. + case ARM::t2STMIA: { + // t2STMIA is reduced to tSTMIA_UPD which has writeback. We can only do this + // if the base register is killed, as then it doesn't matter what its value + // is after the instruction. if (!MI->getOperand(0).isKill()) return false; + // If the base register is in the register list and isn't the lowest + // numbered register (i.e. it's in operand 4 onwards) then with writeback + // the stored value is unknown, so we can't convert to tSTMIA_UPD. + Register BaseReg = MI->getOperand(0).getReg(); + for (unsigned i = 4; i < MI->getNumOperands(); ++i) + if (MI->getOperand(i).getReg() == BaseReg) + return false; + break; + } case ARM::t2LDMIA_RET: { Register BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -676,7 +686,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, default: break; case ARM::t2ADDSri: case ARM::t2ADDSrr: { - unsigned PredReg = 0; + Register PredReg; if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) { switch (Opc) { default: break; @@ -718,7 +728,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop); } case ARM::t2TEQrr: { - unsigned PredReg = 0; + Register PredReg; // Can only convert to eors if we're not in an IT block. if (getInstrPredicate(*MI, PredReg) != ARMCC::AL) break; @@ -789,7 +799,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, // Check if it's possible / necessary to transfer the predicate. const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); bool SkipPred = false; if (Pred != ARMCC::AL) { @@ -882,7 +892,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, // Check if it's possible / necessary to transfer the predicate. 
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg); bool SkipPred = false; if (Pred != ARMCC::AL) { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp index b0ba58d8dc4a..4da6f6ab6994 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -70,7 +70,7 @@ static void emitThumb1LoadConstPool(MachineBasicBlock &MBB, MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4)); BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci)) .addReg(DestReg, getDefRegState(true), SubIdx) @@ -89,7 +89,7 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB, MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = ConstantInt::get( Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val); - unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4); + unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4)); BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci)) .addReg(DestReg, getDefRegState(true), SubIdx) @@ -102,14 +102,13 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB, /// specified immediate. void ThumbRegisterInfo::emitLoadConstPool( MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val, - ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const { + const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val, + ARMCC::CondCodes Pred, Register PredReg, unsigned MIFlags) const { MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (STI.isThumb1Only()) { - assert( - (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) && - "Thumb1 does not have ldr to high register"); + assert((isARMLowRegister(DestReg) || DestReg.isVirtual()) && + "Thumb1 does not have ldr to high register"); return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred, PredReg, MIFlags); } @@ -123,7 +122,7 @@ void ThumbRegisterInfo::emitLoadConstPool( /// constpool entry. static void emitThumbRegPlusImmInReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes, + const DebugLoc &dl, Register DestReg, Register BaseReg, int NumBytes, bool CanChangeCC, const TargetInstrInfo &TII, const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) { MachineFunction &MF = *MBB.getParent(); @@ -139,7 +138,7 @@ static void emitThumbRegPlusImmInReg( isSub = true; NumBytes = -NumBytes; } - unsigned LdReg = DestReg; + Register LdReg = DestReg; if (DestReg == ARM::SP) assert(BaseReg == ARM::SP && "Unexpected!"); if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg)) @@ -185,8 +184,8 @@ static void emitThumbRegPlusImmInReg( /// be too long. This is allowed to modify the condition flags. 
void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, - unsigned BaseReg, int NumBytes, + const DebugLoc &dl, Register DestReg, + Register BaseReg, int NumBytes, const TargetInstrInfo &TII, const ARMBaseRegisterInfo &MRI, unsigned MIFlags) { @@ -358,7 +357,7 @@ static unsigned convertToNonSPOpcode(unsigned Opcode) { bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + Register FrameReg, int &Offset, const ARMBaseInstrInfo &TII) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); @@ -427,8 +426,8 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II, return Offset == 0; } -void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const { +void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, + int64_t Offset) const { const MachineFunction &MF = *MI.getParent()->getParent(); const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (!STI.isThumb1Only()) @@ -458,12 +457,12 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum, RS); - unsigned VReg = 0; + Register VReg; const ARMBaseInstrInfo &TII = *STI.getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB(*MBB.getParent(), &MI); - unsigned FrameReg; + Register FrameReg; int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); const ARMFrameLowering *TFI = getFrameLowering(MF); int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h index 08cf67284d4c..e05a24dbaca5 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ThumbRegisterInfo.h @@ -38,18 +38,18 @@ public: /// specified immediate. void emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, + const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred = ARMCC::AL, - unsigned PredReg = 0, + Register PredReg = Register(), unsigned MIFlags = MachineInstr::NoFlags) const override; // rewrite MI to access 'Offset' bytes from the FP. Update Offset to be // however much remains to be handled. Return 'true' if no further // work is required. 
bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, + Register FrameReg, int &Offset, const ARMBaseInstrInfo &TII) const; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override; void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp index 4ace61cccd0f..3356d56481e5 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp @@ -15,6 +15,37 @@ using namespace llvm; namespace llvm { +ARM::PredBlockMask expandPredBlockMask(ARM::PredBlockMask BlockMask, + ARMVCC::VPTCodes Kind) { + using PredBlockMask = ARM::PredBlockMask; + assert(Kind != ARMVCC::None && "Cannot expand a mask with None!"); + assert(countTrailingZeros((unsigned)BlockMask) != 0 && + "Mask is already full"); + + auto ChooseMask = [&](PredBlockMask AddedThen, PredBlockMask AddedElse) { + return Kind == ARMVCC::Then ? AddedThen : AddedElse; + }; + + switch (BlockMask) { + case PredBlockMask::T: + return ChooseMask(PredBlockMask::TT, PredBlockMask::TE); + case PredBlockMask::TT: + return ChooseMask(PredBlockMask::TTT, PredBlockMask::TTE); + case PredBlockMask::TE: + return ChooseMask(PredBlockMask::TET, PredBlockMask::TEE); + case PredBlockMask::TTT: + return ChooseMask(PredBlockMask::TTTT, PredBlockMask::TTTE); + case PredBlockMask::TTE: + return ChooseMask(PredBlockMask::TTET, PredBlockMask::TTEE); + case PredBlockMask::TET: + return ChooseMask(PredBlockMask::TETT, PredBlockMask::TETE); + case PredBlockMask::TEE: + return ChooseMask(PredBlockMask::TEET, PredBlockMask::TEEE); + default: + llvm_unreachable("Unknown Mask"); + } +} + namespace ARMSysReg { // lookup system register using 12-bit SYSm value. diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h index 27605422983d..80b7276adb4e 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -91,41 +91,41 @@ namespace ARMVCC { Then, Else }; - - enum VPTMaskValue { - T = 8, // 0b1000 - TT = 4, // 0b0100 - TE = 12, // 0b1100 - TTT = 2, // 0b0010 - TTE = 6, // 0b0110 - TEE = 10, // 0b1010 - TET = 14, // 0b1110 - TTTT = 1, // 0b0001 - TTTE = 3, // 0b0011 - TTEE = 5, // 0b0101 - TTET = 7, // 0b0111 - TEEE = 9, // 0b1001 - TEET = 11, // 0b1011 - TETT = 13, // 0b1101 - TETE = 15 // 0b1111 +} // namespace ARMVCC + +namespace ARM { + /// Mask values for IT and VPT Blocks, to be used by MCOperands. + /// Note that this is different from the "real" encoding used by the + /// instructions. In this encoding, the lowest set bit indicates the end of + /// the encoding, and above that, "1" indicates an else, while "0" indicates + /// a then. 
+  /// Tx = x100
+  /// Txy = xy10
+  /// Txyz = xyz1
+  enum class PredBlockMask {
+    T = 0b1000,
+    TT = 0b0100,
+    TE = 0b1100,
+    TTT = 0b0010,
+    TTE = 0b0110,
+    TEE = 0b1110,
+    TET = 0b1010,
+    TTTT = 0b0001,
+    TTTE = 0b0011,
+    TTEE = 0b0111,
+    TTET = 0b0101,
+    TEEE = 0b1111,
+    TEET = 0b1101,
+    TETT = 0b1001,
+    TETE = 0b1011
   };
-}
+} // namespace ARM
 
-inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
-  switch (NumInsts) {
-  case 1:
-    return ARMVCC::T;
-  case 2:
-    return ARMVCC::TT;
-  case 3:
-    return ARMVCC::TTT;
-  case 4:
-    return ARMVCC::TTTT;
-  default:
-    break;
-  };
-  llvm_unreachable("Unexpected number of instruction in a VPT block");
-}
+// Expands a PredBlockMask by adding an E or a T at the end, depending on Kind.
+// e.g. expandPredBlockMask(T, Then) = TT, expandPredBlockMask(TT, Else) = TTE,
+// and so on.
+ARM::PredBlockMask expandPredBlockMask(ARM::PredBlockMask BlockMask,
+                                       ARMVCC::VPTCodes Kind);
 
 inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {
   switch (CC) {
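
Editor's aside, not part of the merged commit: the PredBlockMask comment above is terse, so here is a minimal standalone C++ sketch of the encoding it describes. The lowest set bit of the 4-bit mask is a terminator; each bit above the terminator encodes one predicated instruction after the leading (implicit) Then, with 0 for Then and 1 for Else. decodePredBlockMask is a hypothetical helper written only to illustrate the bit layout; it is not an LLVM API.

#include <cassert>
#include <string>

// Decode a 4-bit PredBlockMask-style value into its "T"/"E" suffix string.
// The lowest set bit is the terminator; bits above it are 0 = Then, 1 = Else.
// (Illustrative only; assumes the encoding documented in ARMBaseInfo.h.)
static std::string decodePredBlockMask(unsigned Mask) {
  assert(Mask != 0 && Mask <= 0xF && "expected a non-zero 4-bit mask");
  unsigned Stop = 0; // position of the terminating (lowest set) bit
  while ((Mask & (1u << Stop)) == 0)
    ++Stop;
  std::string Suffix = "T"; // the leading instruction is always a Then
  for (unsigned Bit = 3; Bit > Stop; --Bit)
    Suffix += (Mask & (1u << Bit)) ? 'E' : 'T';
  return Suffix;
}

int main() {
  // Spot-checks against the enum values above.
  assert(decodePredBlockMask(0b1000) == "T");
  assert(decodePredBlockMask(0b0110) == "TTE");
  assert(decodePredBlockMask(0b1010) == "TET");
  assert(decodePredBlockMask(0b0001) == "TTTT");
  return 0;
}

Under this reading, expandPredBlockMask is the inverse step: appending a Then or Else moves the terminator one bit lower and records the new predicate in the freed bit, which matches the case table implemented in ARMBaseInfo.cpp above.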