Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64InstrInfo.td')
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64InstrInfo.td | 1379 |
1 file changed, 864 insertions, 515 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 1053ba924276..d112d4f10e47 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -143,20 +143,20 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">, "fuse-aes">; def HasSVE : Predicate<"Subtarget->isSVEAvailable()">, AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">; +def HasSVEB16B16 : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEB16B16()">, + AssemblerPredicateWithAll<(all_of FeatureSVEB16B16), "sve-b16b16">; def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">, AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">, AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">; -def HasSVE2AES : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2AES()">, - AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">; +def HasSVEAES : Predicate<"Subtarget->hasSVEAES()">, + AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">; def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">; -def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">, - AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">; -def HasB16B16 : Predicate<"Subtarget->hasB16B16()">, - AssemblerPredicateWithAll<(all_of FeatureB16B16), "b16b16">; +def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">; def HasSMEandIsNonStreamingSafe : Predicate<"Subtarget->hasSME()">, AssemblerPredicateWithAll<(all_of FeatureSME), "sme">; @@ -170,6 +170,8 @@ def HasSMEFA64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF AssemblerPredicateWithAll<(all_of FeatureSMEFA64), "sme-fa64">; def HasSMEI16I64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEI16I64()">, AssemblerPredicateWithAll<(all_of FeatureSMEI16I64), "sme-i16i64">; +def HasSMEB16B16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEB16B16()">, + AssemblerPredicateWithAll<(all_of FeatureSMEB16B16), "sme-b16b16">; def HasSME2andIsNonStreamingSafe : Predicate<"Subtarget->hasSME2()">, AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">; @@ -204,47 +206,104 @@ def HasSSVE_FP8DOT4 : Predicate<"Subtarget->hasSSVE_FP8DOT4() || " "ssve-fp8dot4 or (sve2 and fp8dot4)">; def HasLUT : Predicate<"Subtarget->hasLUT()">, AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">; -def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">, +def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">, AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">; def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">; def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">; +def HasSME_MOP4 : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">, + AssemblerPredicateWithAll<(all_of FeatureSME_MOP4), "sme-mop4">; +def 
HasSME_TMOP : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">, + AssemblerPredicateWithAll<(all_of FeatureSME_TMOP), "sme-tmop">; + +def HasCMPBR : Predicate<"Subtarget->hasCMPBR()">, + AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">; +def HasF8F32MM : Predicate<"Subtarget->hasF8F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">; +def HasF8F16MM : Predicate<"Subtarget->hasF8F16MM()">, + AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">; +def HasFPRCVT : Predicate<"Subtarget->hasFPRCVT()">, + AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">; +def HasLSFE : Predicate<"Subtarget->hasLSFE()">, + AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">; +def HasSME2p2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">, + AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">; +def HasSVEAES2 : Predicate<"Subtarget->hasSVEAES2()">, + AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">; +def HasSVEBFSCALE : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">; +def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">; +def HasPCDPHINT : Predicate<"Subtarget->hasPCDPHINT()">, + AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">; +def HasLSUI : Predicate<"Subtarget->hasLSUI()">, + AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">; +def HasOCCMO : Predicate<"Subtarget->hasOCCMO()">, + AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. 
-def HasSVEorSME +def HasSVE_or_SME : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), "sve or sme">; -def HasSVE2orSME +def HasNonStreamingSVE_or_SME2p2 + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME2p2), + "sve or sme2p2">; +def HasSVE2_or_SME : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), "sve2 or sme">; -def HasSVE2orSME2 +def HasSVE2_or_SME2 : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2), "sve2 or sme2">; -def HasSVE2p1_or_HasSME +def HasNonStreamingSVE2_or_SSVE_AES + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES), "sve2 or ssve-aes">; +def HasSVE2p1_or_SME : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">; -def HasSVE2p1_or_HasSME2 +def HasSVE2p1_or_SME2 : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1), "sme2 or sve2p1">; -def HasSVE2p1_or_HasSME2p1 +def HasSVE2p1_or_SME2p1 : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">, AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">; - -def HasSMEF16F16orSMEF8F16 +def HasSVE2p2_or_SME2p2 + : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">; +def HasNonStreamingSVE2p1_or_SSVE_AES + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2p1, FeatureSSVE_AES), "sve2p1 or ssve-aes">; +def HasSMEF16F16_or_SMEF8F16 : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">, AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16), "sme-f16f16 or sme-f8f16">; +def HasNonStreamingSVE2p2_or_SME2p2 + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2), + "sme2p2 or sve2p2">; +def HasNonStreamingSVE2_or_SSVE_BitPerm + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">; // A subset of NEON instructions are legal in Streaming SVE execution mode, // so don't need the additional check for 'isNeonAvailable'. def HasNEONandIsStreamingSafe : Predicate<"Subtarget->hasNEON()">, AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; +// A subset of NEON instructions are legal in Streaming SVE mode only with +sme2p2. 
+def HasNEONandIsSME2p2StreamingSafe + : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">; def HasRCPC : Predicate<"Subtarget->hasRCPC()">, AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">; def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">, @@ -330,6 +389,8 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; +def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; + def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; @@ -387,9 +448,9 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5, SDTCisInt<3>, SDTCisInt<4>, SDTCisVT<5, i32>]>; -def SDT_AArch64FCmp : SDTypeProfile<0, 2, - [SDTCisFP<0>, - SDTCisSameAs<0, 1>]>; +def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisFP<1>, + SDTCisSameAs<2, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>; @@ -756,6 +817,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; +def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>; + def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; @@ -828,9 +891,11 @@ def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ", SDT_AArch64TLSDescCallSeq, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; + [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>; +def AArch64tlsdesc_auth_callseq : SDNode<"AArch64ISD::TLSDESC_AUTH_CALLSEQ", + SDT_AArch64TLSDescCallSeq, + [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>; def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", SDT_AArch64WrapperLarge>; @@ -853,6 +918,7 @@ def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>; def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>; def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>; +def AArch64usdot : SDNode<"AArch64ISD::USDOT", SDT_AArch64Dot>; def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; @@ -938,8 +1004,10 @@ def AArch64probedalloca [SDNPHasChain, SDNPMayStore]>; def AArch64mrs : SDNode<"AArch64ISD::MRS", - SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>, - [SDNPHasChain, SDNPOutGlue]>; + SDTypeProfile<2, 1, [SDTCisVT<0, i64>, + SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>, + [SDNPHasChain]>; def SD_AArch64rshrnb : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<2>]>; def AArch64rshrnb : SDNode<"AArch64ISD::RSHRNB_I", SD_AArch64rshrnb>; @@ -971,6 +1039,17 @@ def smullwithsignbits : PatFrag<(ops node:$l, node:$r), (mul node:$l, node:$r), CurDAG->ComputeNumSignBits(N->getOperand(1)) > 32; }]>; +// Match "nnan" flagged calls to fminnum and fmmaxnum. Then semantically equivalent +// to fmaximum/fminimum. 
+def fmaxnum_nnan : PatFrag<(ops node:$Rn, node:$Rm), + (fmaxnum node:$Rn, node:$Rm), [{ + return N->getFlags().hasNoNaNs(); + }]>; +def fminnum_nnan : PatFrag<(ops node:$Rn, node:$Rm), + (fminnum node:$Rn, node:$Rm), [{ + return N->getFlags().hasNoNaNs(); + }]>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -1062,7 +1141,7 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs), [(AArch64probedalloca GPR64common:$target)]>, Sched<[]>; -} // Defs = [SP, NZCV], Uses = [SP] in +} // Defs = [SP, NZCV], Uses = [SP] in } // hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { @@ -1215,6 +1294,11 @@ def : InstAlias<"sevl", (HINT 0b101)>; def : InstAlias<"dgh", (HINT 0b110)>; def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; def : InstAlias<"csdb", (HINT 20)>; + +let Predicates = [HasPCDPHINT] in { + def STSHH: STSHHI; +} + // In order to be able to write readable assembly, LLVM should accept assembly // inputs that use Branch Target Indentification mnemonics, even with BTI disabled. // However, in order to be compatible with other assemblers (e.g. GAS), LLVM @@ -1372,8 +1456,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; -def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))), - (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>; +def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))), + (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>; // Vector-scalar BFDOT: // The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit @@ -1395,8 +1479,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in { def BFCVT : BF16ToSinglePrecision<"bfcvt">; -// Round FP32 to BF16. -def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>; } // ARMv8.6A AArch64 matrix multiplication @@ -1404,8 +1486,8 @@ let Predicates = [HasMatMulInt8] in { def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; -defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; -defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", AArch64usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", AArch64usdot>; // sudot lane has a pattern where usdot is expected (there is no sudot). // The second operand is used in the dup operation to repeat the indexed @@ -1415,9 +1497,9 @@ class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind, ValueType AccumType, ValueType InputType> : BaseSIMDThreeSameVectorIndexS<Q, 0, 0b00, 0b1111, "sudot", dst_kind, lhs_kind, rhs_kind, RegType, AccumType, - InputType, null_frag> { + InputType, VectorIndexS, null_frag> { let Pattern = [(set (AccumType RegType:$dst), - (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd), + (AccumType (AArch64usdot (AccumType RegType:$Rd), (InputType (bitconvert (AccumType (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx)))), @@ -1770,28 +1852,28 @@ let Predicates = [HasPAuth] in { // materialization here), in part because they're handled in a safer way by // the kernel, notably on Darwin. 
def BLRA : Pseudo<(outs), (ins GPR64noip:$Rn, i32imm:$Key, i64imm:$Disc, - GPR64noip:$AddrDisc), + GPR64:$AddrDisc), [(AArch64authcall GPR64noip:$Rn, timm:$Key, timm:$Disc, - GPR64noip:$AddrDisc)]>, Sched<[]> { + GPR64:$AddrDisc)]>, Sched<[]> { let isCodeGenOnly = 1; let hasSideEffects = 1; let mayStore = 0; let mayLoad = 0; let isCall = 1; let Size = 12; // 4 fixed + 8 variable, to compute discriminator. - let Defs = [X17,LR]; + let Defs = [X16,X17,LR]; let Uses = [SP]; } def BLRA_RVMARKER : Pseudo< (outs), (ins i64imm:$rvfunc, GPR64noip:$Rn, i32imm:$Key, i64imm:$Disc, - GPR64noip:$AddrDisc), + GPR64:$AddrDisc), [(AArch64authcall_rvmarker tglobaladdr:$rvfunc, GPR64noip:$Rn, timm:$Key, timm:$Disc, - GPR64noip:$AddrDisc)]>, Sched<[]> { + GPR64:$AddrDisc)]>, Sched<[]> { let isCodeGenOnly = 1; let isCall = 1; - let Defs = [X17,LR]; + let Defs = [X16,X17,LR]; let Uses = [SP]; } @@ -1872,8 +1954,15 @@ let Predicates = [HasPAuth] in { Sched<[WriteI, ReadI]> { let isReMaterializable = 1; let isCodeGenOnly = 1; - let Size = 40; // 12 fixed + 28 variable, for pointer offset, and discriminator - let Defs = [X16,X17]; + let Size = 68; // 12 fixed + 56 variable, for pointer offset, discriminator and + // ELF signed GOT signed pointer authentication (if no FPAC) + let Defs = [X16,X17,NZCV]; + } + + def LOADgotAUTH : Pseudo<(outs GPR64common:$dst), (ins i64imm:$addr), []>, + Sched<[WriteI, ReadI]> { + let Defs = [X16,X17,NZCV]; + let Size = 44; } // Load a signed global address from a special $auth_ptr$ stub slot. @@ -1887,30 +1976,36 @@ let Predicates = [HasPAuth] in { } // Size 16: 4 fixed + 8 variable, to compute discriminator. + // The size returned by getInstSizeInBytes() is incremented according + // to the variant of LR check. + // As the check requires either x16 or x17 as a scratch register and + // authenticated tail call instructions have two register operands, + // make sure at least one register is usable as a scratch one - for that + // purpose, use tcGPRnotx16x17 register class for one of the operands. 
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Size = 16, - Uses = [SP] in { + Defs = [X16,X17], Uses = [SP] in { def AUTH_TCRETURN - : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff, i32imm:$Key, + : Pseudo<(outs), (ins tcGPRnotx16x17:$dst, i32imm:$FPDiff, i32imm:$Key, i64imm:$Disc, tcGPR64:$AddrDisc), []>, Sched<[WriteBrReg]>; def AUTH_TCRETURN_BTI : Pseudo<(outs), (ins tcGPRx16x17:$dst, i32imm:$FPDiff, i32imm:$Key, - i64imm:$Disc, tcGPR64:$AddrDisc), + i64imm:$Disc, tcGPRnotx16x17:$AddrDisc), []>, Sched<[WriteBrReg]>; } let Predicates = [TailCallAny] in - def : Pat<(AArch64authtcret tcGPR64:$dst, (i32 timm:$FPDiff), (i32 timm:$Key), + def : Pat<(AArch64authtcret tcGPRnotx16x17:$dst, (i32 timm:$FPDiff), (i32 timm:$Key), (i64 timm:$Disc), tcGPR64:$AddrDisc), - (AUTH_TCRETURN tcGPR64:$dst, imm:$FPDiff, imm:$Key, imm:$Disc, + (AUTH_TCRETURN tcGPRnotx16x17:$dst, imm:$FPDiff, imm:$Key, imm:$Disc, tcGPR64:$AddrDisc)>; let Predicates = [TailCallX16X17] in def : Pat<(AArch64authtcret tcGPRx16x17:$dst, (i32 timm:$FPDiff), (i32 timm:$Key), (i64 timm:$Disc), - tcGPR64:$AddrDisc), + tcGPRnotx16x17:$AddrDisc), (AUTH_TCRETURN_BTI tcGPRx16x17:$dst, imm:$FPDiff, imm:$Key, - imm:$Disc, tcGPR64:$AddrDisc)>; + imm:$Disc, tcGPRnotx16x17:$AddrDisc)>; } // v9.5-A pointer authentication extensions @@ -1935,6 +2030,8 @@ let Predicates = [HasPAuthLR] in { // opcode2, opcode, asm def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppcr">; def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppcr">; + } + let Defs = [X17], Uses = [X15, X16, X17] in { // opcode2, opcode, asm def PACIA171615 : SignAuthFixedRegs<0b00001, 0b100010, "pacia171615">; def PACIB171615 : SignAuthFixedRegs<0b00001, 0b100011, "pacib171615">; @@ -2080,6 +2177,12 @@ def MSR_FPSR : Pseudo<(outs), (ins GPR64:$val), PseudoInstExpansion<(MSR 0xda21, GPR64:$val)>, Sched<[WriteSys]>; +let Defs = [FPMR] in +def MSR_FPMR : Pseudo<(outs), (ins GPR64:$val), + [(int_aarch64_set_fpmr i64:$val)]>, + PseudoInstExpansion<(MSR 0xda22, GPR64:$val)>, + Sched<[WriteSys]>; + // Generic system instructions def SYSxt : SystemXtI<0, "sys">; def SYSLxt : SystemLXtI<1, "sysl">; @@ -2212,7 +2315,7 @@ def s64imm_32bit : ImmLeaf<i64, [{ }]>; def trunc_imm : SDNodeXForm<imm, [{ - return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32); + return CurDAG->getTargetConstant((uint32_t)N->getZExtValue(), SDLoc(N), MVT::i32); }]>; def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">, @@ -2552,12 +2655,74 @@ defm CASPA : CompareAndSwapPair<1, 0, "a">; defm CASPL : CompareAndSwapPair<0, 1, "l">; defm CASPAL : CompareAndSwapPair<1, 1, "al">; +// v9.6-a atomic CAST +let Predicates = [HasLSUI] in { +defm CAST : CompareAndSwapUnprivileged<0b11, 0, 0, "">; +defm CASLT : CompareAndSwapUnprivileged<0b11, 0, 1, "l">; +defm CASAT : CompareAndSwapUnprivileged<0b11, 1, 0, "a">; +defm CASALT : CompareAndSwapUnprivileged<0b11, 1, 1, "al">; + +def : MnemonicAlias<"cas", "cast">; +def : MnemonicAlias<"casl", "caslt">; +def : MnemonicAlias<"casa", "casat">; +def : MnemonicAlias<"casal", "casalt">; + +// v9.6-a atomic CASPT +defm CASPT : CompareAndSwapPairUnprivileged<0b01, 0, 0, "">; +defm CASPLT : CompareAndSwapPairUnprivileged<0b01, 0, 1, "l">; +defm CASPAT : CompareAndSwapPairUnprivileged<0b01, 1, 0, "a">; +defm CASPALT : CompareAndSwapPairUnprivileged<0b01, 1, 1, "al">; + +def : MnemonicAlias<"casp", "caspt">; +def : MnemonicAlias<"caspl", "casplt">; +def : MnemonicAlias<"caspa", "caspat">; +def : MnemonicAlias<"caspal", "caspalt">; +} + // 
v8.1 atomic SWP defm SWP : Swap<0, 0, "">; defm SWPA : Swap<1, 0, "a">; defm SWPL : Swap<0, 1, "l">; defm SWPAL : Swap<1, 1, "al">; +// v9.6a atomic swap (FEAT_LSUI) +let Predicates = [HasLSUI] in { + defm SWPT : SwapLSUI<0, 0, "">; + defm SWPTA : SwapLSUI<1, 0, "a">; + defm SWPTL : SwapLSUI<0, 1, "l">; + defm SWPTAL : SwapLSUI<1, 1, "al">; + + def : MnemonicAlias<"swp", "swpt">; + def : MnemonicAlias<"swpa", "swpta">; + def : MnemonicAlias<"swpl", "swptl">; + def : MnemonicAlias<"swpal", "swptal">; +} + +// v9.6-a unprivileged atomic LD<OP> (FEAT_LSUI) +let Predicates = [HasLSUI] in { + defm LDTADD : LDOPregisterLSUI<0b000, "add", 0, 0, "">; + defm LDTADDA : LDOPregisterLSUI<0b000, "add", 1, 0, "a">; + defm LDTADDL : LDOPregisterLSUI<0b000, "add", 0, 1, "l">; + defm LDTADDAL : LDOPregisterLSUI<0b000, "add", 1, 1, "al">; + + defm LDTCLR : LDOPregisterLSUI<0b001, "clr", 0, 0, "">; + defm LDTCLRA : LDOPregisterLSUI<0b001, "clr", 1, 0, "a">; + defm LDTCLRL : LDOPregisterLSUI<0b001, "clr", 0, 1, "l">; + defm LDTCLRAL : LDOPregisterLSUI<0b001, "clr", 1, 1, "al">; + + defm LDTSET : LDOPregisterLSUI<0b011, "set", 0, 0, "">; + defm LDTSETA : LDOPregisterLSUI<0b011, "set", 1, 0, "a">; + defm LDTSETL : LDOPregisterLSUI<0b011, "set", 0, 1, "l">; + defm LDTSETAL : LDOPregisterLSUI<0b011, "set", 1, 1, "al">; + + defm : STOPregisterLSUI<"sttadd","LDTADD">; // STTADDx + defm : STOPregisterLSUI<"sttclr","LDTCLR">; // STTCLRx + defm : STOPregisterLSUI<"sttset","LDTSET">; // STTSETx +} + +// v9.6-a FEAT_RME_GPC3 +def APAS : APASI; + // v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register) defm LDADD : LDOPregister<0b000, "add", 0, 0, "">; defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">; @@ -2640,14 +2805,17 @@ def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$ def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>; +let mayLoad = 1 in def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]", (outs GPR64:$Rt), (ins GPR64sp:$Rn)>; +let mayStore = 1 in { def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]", (outs), (ins GPR64:$Rt, GPR64sp:$Rn)>; def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]", (outs), (ins GPR64:$Rt, GPR64sp:$Rn)> { let Inst{23} = 0; } +} // mayStore = 1 defm STG : MemTagStore<0b00, "stg">; defm STZG : MemTagStore<0b01, "stzg">; @@ -2827,6 +2995,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; +def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>; +def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>; + def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)), (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))), (REV16Xr GPR64:$Rn)>; @@ -3157,8 +3328,16 @@ def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>, Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>; +let isCall = 1, Defs = [NZCV, LR, X0, X16], hasSideEffects = 1, Size = 16, + isCodeGenOnly = 1 in +def TLSDESC_AUTH_CALLSEQ + : Pseudo<(outs), (ins i64imm:$sym), + [(AArch64tlsdesc_auth_callseq tglobaltlsaddr:$sym)]>, + Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>; def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), (TLSDESC_CALLSEQ texternalsym:$sym)>; +def : Pat<(AArch64tlsdesc_auth_callseq texternalsym:$sym), + (TLSDESC_AUTH_CALLSEQ texternalsym:$sym)>; 
//===----------------------------------------------------------------------===// // Conditional branch (immediate) instruction. @@ -3302,59 +3481,6 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>; // Pre-fetch. defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. -multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop, - ValueType ScalTy, ValueType VecTy, - Instruction LOADW, Instruction LOADX, - SubRegIndex sub> { - def : Pat<(VecTy (scalar_to_vector (ScalTy - (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), - (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), - (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), - sub)>; - - def : Pat<(VecTy (scalar_to_vector (ScalTy - (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), - (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), - (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), - sub)>; -} - -let AddedComplexity = 10 in { -defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>; -defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>; - -defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>; -defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>; - -defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>; -defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>; - -defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>; -defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>; - -defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>; -defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>; - -defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>; - -defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>; - - -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend64:$extend))))), - (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; - -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend64:$extend))))), - (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -} - // Match all load 64 bits width whose type is compatible with FPR64 multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy, Instruction LOADW, Instruction LOADX> { @@ -3478,42 +3604,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr", def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>; -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. 
-def : Pat <(v8i8 (scalar_to_vector (i32 - (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 - (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 - (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 - (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat <(v2i64 (scalar_to_vector (i64 - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; - // Match all load 64 bits width whose type is compatible with FPR64 let Predicates = [IsLE] in { // We must use LD1 to perform vector loads in big-endian. @@ -3879,12 +3969,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, -// load, 0) can use a single load. -multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT, - ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst, - ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm, - SubRegIndex SubReg> { +// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0) +// can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0). 
+multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT, + Instruction LoadInst, Instruction UnscaledLoadInst, + Instruction ROWLoadInst, Instruction ROXLoadInst, + ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr, + Operand AddrImm, SubRegIndex SubReg> { // Scaled def : Pat <(vector_insert (VT immAllZerosV), (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)), @@ -3893,42 +3984,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT def : Pat <(vector_insert (VT immAllZerosV), (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)), (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>; + // roW + def : Pat <(vector_insert (VT immAllZerosV), + (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)), + (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>; + // roX + def : Pat <(vector_insert (VT immAllZerosV), + (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)), + (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>; - // Half-vector patterns - def : Pat <(vector_insert (HVT immAllZerosV), - (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)), - (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>; - // Unscaled - def : Pat <(vector_insert (HVT immAllZerosV), - (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)), - (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>; - - // SVE patterns - def : Pat <(vector_insert (SVT immAllZerosV), - (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)), - (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>; - // Unscaled - def : Pat <(vector_insert (SVT immAllZerosV), - (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)), + // Undef equivalents of the patterns above. 
+ def : Pat <(VT (vec_ins_or_scal_vec + (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))), + (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>; + def : Pat <(VT (vec_ins_or_scal_vec + (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))), (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>; -} - -defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, LDRBui, LDURBi, - am_indexed8, am_unscaled8, uimm12s1, bsub>; -defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, LDRHui, LDURHi, - am_indexed16, am_unscaled16, uimm12s2, hsub>; -defm : LoadInsertZeroPatterns<load, v4i32, v2i32, nxv4i32, i32, LDRSui, LDURSi, - am_indexed32, am_unscaled32, uimm12s4, ssub>; -defm : LoadInsertZeroPatterns<load, v2i64, v1i64, nxv2i64, i64, LDRDui, LDURDi, - am_indexed64, am_unscaled64, uimm12s8, dsub>; -defm : LoadInsertZeroPatterns<load, v8f16, v4f16, nxv8f16, f16, LDRHui, LDURHi, - am_indexed16, am_unscaled16, uimm12s2, hsub>; -defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi, - am_indexed16, am_unscaled16, uimm12s2, hsub>; -defm : LoadInsertZeroPatterns<load, v4f32, v2f32, nxv4f32, f32, LDRSui, LDURSi, - am_indexed32, am_unscaled32, uimm12s4, ssub>; -defm : LoadInsertZeroPatterns<load, v2f64, v1f64, nxv2f64, f64, LDRDui, LDURDi, - am_indexed64, am_unscaled64, uimm12s8, dsub>; + def : Pat <(VT (vec_ins_or_scal_vec + (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))), + (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>; + def : Pat <(VT (vec_ins_or_scal_vec + (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))), + (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>; +} + +multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT, + ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst, + Instruction ROWLoadInst, Instruction ROXLoadInst, + ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr, + Operand AddrImm, SubRegIndex SubReg> { + defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst, + ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>; + defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst, + ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>; + defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst, + ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>; +} + +defm : LoadInsertPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, + LDRBui, LDURBi, LDRBroW, LDRBroX, + ro8, am_indexed8, am_unscaled8, uimm12s1, bsub>; +defm : LoadInsertPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, + LDRHui, LDURHi, LDRHroW, LDRHroX, + ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>; +defm : LoadInsertPatterns<load, v4i32, v2i32, nxv4i32, i32, + LDRSui, LDURSi, LDRSroW, LDRSroX, + ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>; +defm : LoadInsertPatterns<load, v2i64, isVoid, nxv2i64, i64, + LDRDui, LDURDi, LDRDroW, LDRDroX, + ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>; +defm : LoadInsertPatterns<load, v8f16, v4f16, nxv8f16, f16, + LDRHui, LDURHi, LDRHroW, LDRHroX, + ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>; +defm : LoadInsertPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, + LDRHui, LDURHi, LDRHroW, LDRHroX, + ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>; +defm : 
LoadInsertPatterns<load, v4f32, v2f32, nxv4f32, f32, + LDRSui, LDURSi, LDRSroW, LDRSroX, + ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>; +defm : LoadInsertPatterns<load, v2f64, isVoid, nxv2f64, f64, + LDRDui, LDURDi, LDRDroW, LDRDroX, + ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>; + +// Extra patterns for v1f64 scalar_to_vector(load), which need to avoid the +// SUBREG_TO_REG used above. +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>; // Pre-fetch. defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", @@ -4049,6 +4180,33 @@ defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">; defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">; } +// Armv9.6-a Load/store pair (FEAT_LSUI) +let Predicates = [HasLSUI] in { + defm LDTP : LoadPairOffset<0b11, 0, GPR64z, simm7s8, "ldtp">; + def LDTPpre : LoadPairPreIdx<0b11, 0, GPR64z, simm7s8, "ldtp">; + def LDTPpost : LoadPairPostIdx<0b11, 0, GPR64z, simm7s8, "ldtp">; + + defm STTNPX : StorePairNoAllocLSUI<0b11, 0, GPR64z, simm7s8, "sttnp">; + defm LDTNPX : LoadPairNoAllocLSUI<0b11, 0, GPR64z, simm7s8, "ldtnp">; + + defm STTP : StorePairOffset<0b11, 0, GPR64z, simm7s8, "sttp">; + def STTPpre : StorePairPreIdx<0b11, 0, GPR64z, simm7s8, "sttp">; + def STTPpost : StorePairPostIdx<0b11, 0, GPR64z, simm7s8, "sttp">; +} + +let Predicates = [HasLSUI, HasNEON] in { + defm LDTPQ : LoadPairOffset<0b11, 1, FPR128Op, simm7s16, "ldtp">; + def LDTPQpre : LoadPairPreIdx<0b11, 1, FPR128Op, simm7s16, "ldtp">; + def LDTPQpost : LoadPairPostIdx<0b11, 1, FPR128Op, simm7s16, "ldtp">; + + defm STTNPQ : StorePairNoAllocLSUI<0b11, 1, FPR128Op, simm7s16, "sttnp">; + defm LDTNPQ : LoadPairNoAllocLSUI<0b11, 1, FPR128Op, simm7s16, "ldtnp">; + + defm STTPQ : StorePairOffset<0b11, 1, FPR128Op, simm7s16, "sttp">; + def STTPQpre : StorePairPreIdx<0b11, 1, FPR128Op, simm7s16, "sttp">; + def STTPQpost : StorePairPostIdx<0b11, 1, FPR128Op, simm7s16, "sttp">; +} + def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>; @@ -4504,6 +4662,10 @@ def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>; def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>; def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>; +// bf16 pre-index store +def : Pat<(pre_store (bf16 FPR16:$Rt), GPR64sp:$addr, simm9:$off), + (STRHpre FPR16:$Rt, GPR64sp:$addr, simm9:$off)>; + // truncstore i64 def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, @@ -4529,6 +4691,8 @@ def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, 
GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; @@ -4544,6 +4708,8 @@ def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; //--- // (immediate post-indexed) @@ -4689,6 +4855,29 @@ let Predicates = [HasLOR] in { def STLLRH0 : InstAlias<"stllrh\t$Rt, [$Rn, #0]", (STLLRH GPR32: $Rt, GPR64sp:$Rn)>; } +// v9.6-a Unprivileged load store operations +let Predicates = [HasLSUI] in { +defm LDTXRW : LoadUnprivilegedLSUI<0b10, GPR32, "ldtxr">; +defm LDTXRX : LoadUnprivilegedLSUI<0b11, GPR64, "ldtxr">; + +def : MnemonicAlias<"ldxr", "ldtxr">; + +def LDATXRW : LoadExclusiveLSUI <0b10, 1, 1, GPR32, "ldatxr">; +def LDATXRX : LoadExclusiveLSUI <0b11, 1, 1, GPR64, "ldatxr">; + +def : MnemonicAlias<"ldaxr", "ldatxr">; + +defm STTXRW : StoreUnprivilegedLSUI<0b10, GPR32, "sttxr">; +defm STTXRX : StoreUnprivilegedLSUI<0b11, GPR64, "sttxr">; + +def : MnemonicAlias<"stxr", "sttxr">; + +def STLTXRW : StoreExclusiveLSUI<0b10, 0, 1, GPR32, "stltxr">; +def STLTXRX : StoreExclusiveLSUI<0b11, 0, 1, GPR64, "stltxr">; + +def : MnemonicAlias<"stlxr", "stltxr">; +} + //===----------------------------------------------------------------------===// // Scaled floating point to integer conversion instructions. //===----------------------------------------------------------------------===// @@ -4706,8 +4895,21 @@ defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; +let Predicates = [HasNEON, HasFPRCVT] in{ + defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas">; + defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau">; + defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms">; + defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu">; + defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns">; + defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu">; + defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps">; + defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu">; + defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">; + defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">; +} + // AArch64's FCVT instructions saturate when out of range. 
-multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> { +multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { let Predicates = [HasFullFP16] in { def : Pat<(i32 (to_int_sat f16:$Rn, i32)), (!cast<Instruction>(INST # UWHr) f16:$Rn)>; @@ -4724,6 +4926,21 @@ multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> { (!cast<Instruction>(INST # UXDr) f64:$Rn)>; let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat_gi f16:$Rn)), + (!cast<Instruction>(INST # UWHr) f16:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f16:$Rn)), + (!cast<Instruction>(INST # UXHr) f16:$Rn)>; + } + def : Pat<(i32 (to_int_sat_gi f32:$Rn)), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f32:$Rn)), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int_sat_gi f64:$Rn)), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int_sat_gi f64:$Rn)), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; + + let Predicates = [HasFullFP16] in { def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)), (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)), @@ -4737,10 +4954,25 @@ multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> { (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)), (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; + + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; + } + def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; } -defm : FPToIntegerSatPats<fp_to_sint_sat, "FCVTZS">; -defm : FPToIntegerSatPats<fp_to_uint_sat, "FCVTZU">; +defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; +defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; multiclass FPToIntegerIntPats<Intrinsic round, string INST> { let Predicates = [HasFullFP16] in { @@ -4834,8 +5066,13 @@ def : Pat<(i64 (any_llround f64:$Rn)), // Scaled integer to floating point conversion instructions. 
//===----------------------------------------------------------------------===// -defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>; -defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>; +defm SCVTF : IntegerToFP<0b00, 0b010, "scvtf", any_sint_to_fp>; +defm UCVTF : IntegerToFP<0b00, 0b011, "ucvtf", any_uint_to_fp>; + +let Predicates = [HasNEON, HasFPRCVT] in { + defm SCVTF : IntegerToFPSIMDScalar<0b11, 0b100, "scvtf">; + defm UCVTF : IntegerToFPSIMDScalar<0b11, 0b101, "ucvtf">; +} def : Pat<(f16 (fdiv (f16 (any_sint_to_fp (i32 GPR32:$Rn))), fixedpoint_f16_i32:$scale)), (SCVTFSWHri GPR32:$Rn, fixedpoint_f16_i32:$scale)>; @@ -4907,22 +5144,6 @@ let Predicates = [HasFullFP16] in { //===----------------------------------------------------------------------===// defm FCVT : FPConversion<"fcvt">; -// Helper to get bf16 into fp32. -def cvt_bf16_to_fp32 : - OutPatFrag<(ops node:$Rn), - (f32 (COPY_TO_REGCLASS - (i32 (UBFMWri - (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - node:$Rn, hsub), GPR32)), - (i64 (i32shift_a (i64 16))), - (i64 (i32shift_b (i64 16))))), - FPR32))>; -// Pattern for bf16 -> fp32. -def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), - (cvt_bf16_to_fp32 FPR16:$Rn)>; -// Pattern for bf16 -> fp64. -def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))), - (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>; //===----------------------------------------------------------------------===// // Floating point single operand instructions. @@ -5049,6 +5270,27 @@ def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(fminnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)), + (FMINNMDrr FPR64:$a, FPR64:$b)>; +def : Pat<(fmaxnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)), + (FMAXNMDrr FPR64:$a, FPR64:$b)>; +def : Pat<(f64 (fcanonicalize f64:$a)), + (FMINNMDrr f64:$a, f64:$a)>; +def : Pat<(fminnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), + (FMINNMSrr FPR32:$a, FPR32:$b)>; +def : Pat<(fmaxnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)), + (FMAXNMSrr FPR32:$a, FPR32:$b)>; +def : Pat<(f32 (fcanonicalize f32:$a)), + (FMINNMSrr f32:$a, f32:$a)>; + +let Predicates = [HasFullFP16] in { +def : Pat<(fminnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), + (FMINNMHrr FPR16:$a, FPR16:$b)>; +def : Pat<(fmaxnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)), + (FMAXNMHrr FPR16:$a, FPR16:$b)>; +def : Pat<(f16 (fcanonicalize f16:$a)), + (FMINNMHrr f16:$a, f16:$a)>; +} //===----------------------------------------------------------------------===// // Floating point three operand instructions. //===----------------------------------------------------------------------===// @@ -5159,7 +5401,7 @@ let isPseudo = 1 in { //===----------------------------------------------------------------------===// let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in { - def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>; + def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret bb)]>, Sched<[]>; let usesCustomInserter = 1 in def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>, Sched<[]>; @@ -5286,12 +5528,17 @@ defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>; // AArch64's FCVT instructions saturate when out of range. 
-multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> { +multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> { let Predicates = [HasFullFP16] in { def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)), (!cast<Instruction>(INST # v4f16) v4f16:$Rn)>; def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)), (!cast<Instruction>(INST # v8f16) v8f16:$Rn)>; + + def : Pat<(v4i16 (to_int_sat_gi v4f16:$Rn)), + (!cast<Instruction>(INST # v4f16) v4f16:$Rn)>; + def : Pat<(v8i16 (to_int_sat_gi v8f16:$Rn)), + (!cast<Instruction>(INST # v8f16) v8f16:$Rn)>; } def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)), (!cast<Instruction>(INST # v2f32) v2f32:$Rn)>; @@ -5299,9 +5546,16 @@ multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> { (!cast<Instruction>(INST # v4f32) v4f32:$Rn)>; def : Pat<(v2i64 (to_int_sat v2f64:$Rn, i64)), (!cast<Instruction>(INST # v2f64) v2f64:$Rn)>; + + def : Pat<(v2i32 (to_int_sat_gi v2f32:$Rn)), + (!cast<Instruction>(INST # v2f32) v2f32:$Rn)>; + def : Pat<(v4i32 (to_int_sat_gi v4f32:$Rn)), + (!cast<Instruction>(INST # v4f32) v4f32:$Rn)>; + def : Pat<(v2i64 (to_int_sat_gi v2f64:$Rn)), + (!cast<Instruction>(INST # v2f64) v2f64:$Rn)>; } -defm : SIMDTwoVectorFPToIntSatPats<fp_to_sint_sat, "FCVTZS">; -defm : SIMDTwoVectorFPToIntSatPats<fp_to_uint_sat, "FCVTZU">; +defm : SIMDTwoVectorFPToIntSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">; +defm : SIMDTwoVectorFPToIntSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">; def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>; def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>; @@ -5363,14 +5617,14 @@ defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; -defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; -defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", truncssat_s>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", truncssat_u>; defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >; defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>; defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>; -defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; +defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", truncusat_u>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; @@ -5409,84 +5663,16 @@ defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>; defm : SIMDVectorLShiftLongBySizeBHSPats<zext>; defm : SIMDVectorLShiftLongBySizeBHSPats<sext>; -// Constant vector values, used in the S/UQXTN patterns below. 
-def VImmFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 85))))>; -def VImmFFFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 51))))>; -def VImm7F: PatLeaf<(AArch64movi_shift (i32 127), (i32 0))>; -def VImm80: PatLeaf<(AArch64mvni_shift (i32 127), (i32 0))>; -def VImm7FFF: PatLeaf<(AArch64movi_msl (i32 127), (i32 264))>; -def VImm8000: PatLeaf<(AArch64mvni_msl (i32 127), (i32 264))>; - -// trunc(umin(X, 255)) -> UQXTRN v8i8 -def : Pat<(v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))), - (UQXTNv8i8 V128:$Vn)>; -// trunc(umin(X, 65535)) -> UQXTRN v4i16 -def : Pat<(v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))), - (UQXTNv4i16 V128:$Vn)>; -// trunc(smin(smax(X, -128), 128)) -> SQXTRN -// with reversed min/max -def : Pat<(v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)), - (v8i16 VImm7F)))), - (SQXTNv8i8 V128:$Vn)>; -def : Pat<(v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), - (v8i16 VImm80)))), - (SQXTNv8i8 V128:$Vn)>; -// trunc(smin(smax(X, -32768), 32767)) -> SQXTRN -// with reversed min/max -def : Pat<(v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), - (v4i32 VImm7FFF)))), - (SQXTNv4i16 V128:$Vn)>; -def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), - (v4i32 VImm8000)))), - (SQXTNv4i16 V128:$Vn)>; - -// concat_vectors(Vd, trunc(umin(X, 255))) -> UQXTRN(Vd, Vn) -def : Pat<(v16i8 (concat_vectors - (v8i8 V64:$Vd), - (v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))))), - (UQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -// concat_vectors(Vd, trunc(umin(X, 65535))) -> UQXTRN(Vd, Vn) -def : Pat<(v8i16 (concat_vectors - (v4i16 V64:$Vd), - (v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))))), - (UQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - -// concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn) -// with reversed min/max -def : Pat<(v16i8 (concat_vectors - (v8i8 V64:$Vd), - (v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)), - (v8i16 VImm7F)))))), - (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v16i8 (concat_vectors - (v8i8 V64:$Vd), - (v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), - (v8i16 VImm80)))))), - (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - -// concat_vectors(Vd, trunc(smin(smax Vm, -32768), 32767) ~> SQXTN2(Vd, Vn) -// with reversed min/max -def : Pat<(v8i16 (concat_vectors - (v4i16 V64:$Vd), - (v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), - (v4i32 VImm7FFF)))))), - (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; -def : Pat<(v8i16 (concat_vectors - (v4i16 V64:$Vd), - (v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), - (v4i32 VImm8000)))))), - (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; - // Select BSWAP vector instructions into REV instructions -def : Pat<(v4i16 (bswap (v4i16 V64:$Rn))), +def : Pat<(v4i16 (bswap (v4i16 V64:$Rn))), (v4i16 (REV16v8i8 (v4i16 V64:$Rn)))>; -def : Pat<(v8i16 (bswap (v8i16 V128:$Rn))), +def : Pat<(v8i16 (bswap (v8i16 V128:$Rn))), (v8i16 (REV16v16i8 (v8i16 V128:$Rn)))>; -def : Pat<(v2i32 (bswap (v2i32 V64:$Rn))), +def : Pat<(v2i32 (bswap (v2i32 V64:$Rn))), (v2i32 (REV32v8i8 (v2i32 V64:$Rn)))>; -def : Pat<(v4i32 (bswap (v4i32 V128:$Rn))), +def : Pat<(v4i32 (bswap (v4i32 V128:$Rn))), (v4i32 (REV32v16i8 (v4i32 V128:$Rn)))>; -def : Pat<(v2i64 (bswap (v2i64 V128:$Rn))), +def : Pat<(v2i64 (bswap (v2i64 V128:$Rn))), (v2i64 (REV64v16i8 (v2i64 
V128:$Rn)))>; //===----------------------------------------------------------------------===// @@ -5530,6 +5716,42 @@ defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>; +let Predicates = [HasNEON] in { +def : Pat<(v2f64 (fminnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))), + (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>; +def : Pat<(v2f64 (fmaxnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))), + (v2f64 (FMAXNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>; +def : Pat<(v2f64 (fcanonicalize (v2f64 V128:$Rn))), + (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rn)))>; +def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), + (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; +def : Pat<(v4f32 (fmaxnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))), + (v4f32 (FMAXNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>; +def : Pat<(v4f32 (fcanonicalize (v4f32 V128:$Rn))), + (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rn)))>; +def : Pat<(v2f32 (fminnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; +def : Pat<(v2f32 (fmaxnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (v2f32 (FMAXNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>; +def : Pat<(v2f32 (fcanonicalize (v2f32 V64:$Rn))), + (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rn)))>; +} + +let Predicates = [HasNEON, HasFullFP16] in { +def : Pat<(v8f16 (fminnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v8f16 (fmaxnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (v8f16 (FMAXNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>; +def : Pat<(v8f16 (fcanonicalize (v8f16 V128:$Rn))), + (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rn)))>; +def : Pat<(v4f16 (fminnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; +def : Pat<(v4f16 (fmaxnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (v4f16 (FMAXNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>; +def : Pat<(v4f16 (fcanonicalize (v4f16 V64:$Rn))), + (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rn)))>; +} + // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", @@ -6156,7 +6378,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // Some float -> int -> float conversion patterns for which we want to keep the // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. 
-let Predicates = [HasNEONandIsStreamingSafe] in { +let Predicates = [HasNEONandIsSME2p2StreamingSafe] in { def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), @@ -6166,14 +6388,14 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; -let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { +let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in { def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>; } -// int -> float conversion of value in lane 0 of simd vector should use +// int -> float conversion of value in lane 0 of simd vector should use // correct cvtf variant to avoid costly fpr <-> gpr register transfers. def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))), (SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>; @@ -6189,13 +6411,13 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))), // fp16: integer extraction from vector must be at least 32-bits to be legal. // Actual extraction result is then an in-reg sign-extension of lower 16-bits. -let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { -def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract - (v8i16 FPR128:$Rn), (i64 0))), i16)))), +let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in { +def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract + (v8i16 FPR128:$Rn), (i64 0))), i16)))), (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>; // unsigned 32-bit extracted element is truncated to 16-bits using AND -def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract +def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract (v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))), (UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>; } @@ -6286,7 +6508,7 @@ def : Pat <(f64 (uint_to_fp (i32 (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. -} // let Predicates = [HasNEONandIsStreamingSafe] +} // let Predicates = [HasNEON] //===----------------------------------------------------------------------===// // Advanced SIMD three different-sized vector instructions. 
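Editor's sketch (not from the patch): the lane-0 conversion patterns in the hunk above target extractions like the one below, where the element can be converted in place on the SIMD side rather than being moved to a general-purpose register first:

    #include <arm_neon.h>
    // Sketch only: with the patterns above this should select
    // "scvtf s0, s0" on the low lane rather than "fmov w0, s0 ; scvtf".
    float lane0_to_float(int32x4_t v) {
      return static_cast<float>(vgetq_lane_s32(v, 0));
    }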
@@ -6588,6 +6810,46 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), let Predicates = [HasLUT] in { defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">; defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">; + + multiclass Luti2_patterns<Instruction Instr, ValueType VT64, ValueType VT128>{ + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT64:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), + V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn, + v8i8:$Rm, i32:$idx)), + (Instr V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), + VectorIndexS32b_timm:$idx)>; + def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT128:$Rn, + v16i8:$Rm, i32:$idx)), + (Instr V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>; + } + + defm : Luti2_patterns<LUT2_B, v8i8, v16i8>; + defm : Luti2_patterns<LUT2_H, v4i16, v8i16>; + defm : Luti2_patterns<LUT2_H, v4f16, v8f16>; + defm : Luti2_patterns<LUT2_H, v4bf16, v8bf16>; + + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq v16i8:$Rn, + v16i8:$Rm, i32:$idx)), + (LUT4_B VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>; + def : Pat<(v16i8 (int_aarch64_neon_vluti4q_lane v16i8:$Rn, + v8i8:$Rm, i32:$idx)), + (LUT4_B VecListOne16b:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexD32b_timm:$idx)>; + + foreach VT = [v8i16, v8f16, v8bf16] in { + def : Pat<(VT (int_aarch64_neon_vluti4q_laneq_x2 VT:$Rn1, + VT:$Rn2, v16i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>; + def : Pat<(VT (int_aarch64_neon_vluti4q_lane_x2 VT:$Rn1, + VT:$Rn2, v8i8:$Rm, i32:$idx)), + (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>; + } } //---------------------------------------------------------------------------- @@ -6728,6 +6990,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; +// Also covers DUP (truncate i64 to i32) +def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))), + (DUPv2i32lane V128:$Rn, imm:$idx)>; +def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))), + (DUPv4i32lane V128:$Rn, imm:$idx)>; + // If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane // instruction even if the types don't match: we just have to remap the lane // carefully. N.b. this trick only applies to truncations. 
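A hedged example (not part of the patch) of the splat-of-a-lane case that the dup-of-extracted-lane patterns added in the hunk above are concerned with:

    #include <arm_neon.h>
    // Sketch only: expected to select a single "dup v0.4s, v0.s[2]".
    int32x4_t splat_lane2(int32x4_t v) {
      return vdupq_laneq_s32(v, 2);
    }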
@@ -6741,44 +7009,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64); }]>; -multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT, - ValueType Src128VT, ValueType ScalVT, - Instruction DUP, SDNodeXForm IdxXFORM> { - def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn), - imm:$idx)))), - (DUP V128:$Rn, (IdxXFORM imm:$idx))>; - - def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn), - imm:$idx)))), - (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; -} - -defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>; -defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>; -defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>; - -defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>; -defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>; -defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>; - -multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, - SDNodeXForm IdxXFORM> { - def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn), - imm:$idx))))), - (DUP V128:$Rn, (IdxXFORM imm:$idx))>; - - def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn), - imm:$idx))))), - (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; -} - -defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>; -defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>; -defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>; - -defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>; -defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>; -defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>; +class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT, + Instruction DUP, SDNodeXForm IdxXFORM> + : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))), + (DUP V128:$Rn, (IdxXFORM imm:$idx))>; + +// DUP (truncate i16 to i8) +def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>; +def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>; +// DUP (truncate i32/64 to i8) +def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>; +def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>; +// DUP (truncate i32/i64 to i16) +def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>; +def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>; // SMOV and UMOV definitions, with some extra patterns for convenience defm SMOV : SMov; @@ -6824,10 +7068,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), defm INS : SIMDIns; -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; @@ -6835,50 +7079,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))), (SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>; -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), +def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), +def : 
Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), +def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; -def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), +def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))), (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; - -def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), +def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; -def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), +def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))), +def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))), (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), +def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), +def : Pat<(v2f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))), (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), +def : Pat<(v2f64 (vec_ins_or_scal_vec (f64 FPR64:$Rn))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), @@ -7002,8 +7245,23 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) )>; -multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, - ValueType VTScal, Instruction INS> { +// Move elements between vectors +multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE, + ValueType VTScal, Operand SVEIdxTy, Instruction INS> { + // Extracting from the lowest 128-bits of an SVE vector + def : Pat<(VT128 (vector_insert VT128:$Rn, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 imm:$Immd))), + (INS VT128:$Rn, imm:$Immd, (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn)>; + + def : Pat<(VT64 (vector_insert VT64:$Rn, + (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))), + (i64 imm:$Immd))), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), VT64:$Rn, dsub), imm:$Immd, + (VT128 (EXTRACT_SUBREG VTSVE:$Rm, 
zsub)), SVEIdxTy:$Immn), + dsub)>; + // Extracting from another NEON vector def : Pat<(VT128 (vector_insert V128:$src, (VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))), (i64 imm:$Immd))), @@ -7031,15 +7289,15 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, dsub)>; } -defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>; -defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>; -defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>; -defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>; +defm : Neon_INS_elt_pattern<v8f16, v4f16, nxv8f16, f16, VectorIndexH, INSvi16lane>; +defm : Neon_INS_elt_pattern<v8bf16, v4bf16, nxv8bf16, bf16, VectorIndexH, INSvi16lane>; +defm : Neon_INS_elt_pattern<v4f32, v2f32, nxv4f32, f32, VectorIndexS, INSvi32lane>; +defm : Neon_INS_elt_pattern<v2f64, v1f64, nxv2f64, f64, VectorIndexD, INSvi64lane>; -defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>; -defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>; -defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>; -defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi64lane>; +defm : Neon_INS_elt_pattern<v16i8, v8i8, nxv16i8, i32, VectorIndexB, INSvi8lane>; +defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi16lane>; +defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>; +defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>; // Insert from bitcast // vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0) @@ -7089,7 +7347,8 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx), // All concat_vectors operations are canonicalised to act on i64 vectors for // AArch64. In the general case we need an instruction, which had just as well be // INS. -multiclass ConcatPat<ValueType DstTy, ValueType SrcTy> { +multiclass ConcatPat<ValueType DstTy, ValueType SrcTy, + ComplexPattern ExtractHigh> { def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; @@ -7102,16 +7361,22 @@ multiclass ConcatPat<ValueType DstTy, ValueType SrcTy> { // If the high lanes are undef we can just ignore them: def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; + + // Concatting the high half of two vectors is the insert of the first + // into the low half of the second. 
+ def : Pat<(DstTy (concat_vectors (ExtractHigh (DstTy V128:$Rn)), + (ExtractHigh (DstTy V128:$Rm)))), + (INSvi64lane V128:$Rm, (i64 0), V128:$Rn, (i64 1))>; } -defm : ConcatPat<v2i64, v1i64>; -defm : ConcatPat<v2f64, v1f64>; -defm : ConcatPat<v4i32, v2i32>; -defm : ConcatPat<v4f32, v2f32>; -defm : ConcatPat<v8i16, v4i16>; -defm : ConcatPat<v8f16, v4f16>; -defm : ConcatPat<v8bf16, v4bf16>; -defm : ConcatPat<v16i8, v8i8>; +defm : ConcatPat<v2i64, v1i64, extract_high_v2i64>; +defm : ConcatPat<v2f64, v1f64, extract_high_v2f64>; +defm : ConcatPat<v4i32, v2i32, extract_high_v4i32>; +defm : ConcatPat<v4f32, v2f32, extract_high_v4f32>; +defm : ConcatPat<v8i16, v4i16, extract_high_v8i16>; +defm : ConcatPat<v8f16, v4f16, extract_high_v8f16>; +defm : ConcatPat<v8bf16, v4bf16, extract_high_v8bf16>; +defm : ConcatPat<v16i8, v8i8, extract_high_v16i8>; //---------------------------------------------------------------------------- // AdvSIMD across lanes instructions @@ -7172,17 +7437,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator ad defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>; defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>; -// Patterns for uaddlv(uaddlp(x)) ==> uaddlv -def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), - (i64 (EXTRACT_SUBREG - (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)), - dsub))>; - -def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))), - (i32 (EXTRACT_SUBREG - (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)), - ssub))>; - def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))), (v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>; @@ -7325,19 +7579,19 @@ def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))), } // For vecreduce_add, used by GlobalISel not SDAG -def : Pat<(i8 (vecreduce_add (v8i8 V64:$Rn))), +def : Pat<(i8 (vecreduce_add (v8i8 V64:$Rn))), (i8 (ADDVv8i8v V64:$Rn))>; -def : Pat<(i8 (vecreduce_add (v16i8 V128:$Rn))), +def : Pat<(i8 (vecreduce_add (v16i8 V128:$Rn))), (i8 (ADDVv16i8v V128:$Rn))>; -def : Pat<(i16 (vecreduce_add (v4i16 V64:$Rn))), +def : Pat<(i16 (vecreduce_add (v4i16 V64:$Rn))), (i16 (ADDVv4i16v V64:$Rn))>; -def : Pat<(i16 (vecreduce_add (v8i16 V128:$Rn))), +def : Pat<(i16 (vecreduce_add (v8i16 V128:$Rn))), (i16 (ADDVv8i16v V128:$Rn))>; -def : Pat<(i32 (vecreduce_add (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_add (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub))>; -def : Pat<(i32 (vecreduce_add (v4i32 V128:$Rn))), +def : Pat<(i32 (vecreduce_add (v4i32 V128:$Rn))), (i32 (ADDVv4i32v V128:$Rn))>; -def : Pat<(i64 (vecreduce_add (v2i64 V128:$Rn))), +def : Pat<(i64 (vecreduce_add (v2i64 V128:$Rn))), (i64 (ADDPv2i64p V128:$Rn))>; defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>; @@ -7382,103 +7636,33 @@ def : Pat<(i16 (opNode (v4i16 FPR64:$Rn))), def : Pat<(i16 (opNode (v8i16 FPR128:$Rn))), (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) FPR128:$Rn)>; -def : Pat<(i32 (opNode (v4i32 V128:$Rn))), +def : Pat<(i32 (opNode (v4i32 V128:$Rn))), (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn)>; } // For v2i32 source type, the pairwise instruction can be used instead defm : SIMDAcrossLanesVecReductionIntrinsic<"UMINV", vecreduce_umin>; -def : Pat<(i32 (vecreduce_umin (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_umin (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), 
ssub))>; defm : SIMDAcrossLanesVecReductionIntrinsic<"UMAXV", vecreduce_umax>; -def : Pat<(i32 (vecreduce_umax (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_umax (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>; defm : SIMDAcrossLanesVecReductionIntrinsic<"SMINV", vecreduce_smin>; -def : Pat<(i32 (vecreduce_smin (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_smin (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub))>; defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>; -def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))), +def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))), (i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>; -multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc, - Intrinsic intOp> { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; -defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; - -// The vaddlv_s32 intrinsic gets mapped to SADDLP. -def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (SADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; -// The vaddlv_u32 intrinsic gets mapped to UADDLP. -def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (UADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; +// The SADDLV v2i32 gets mapped to SADDLP. 
+def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>; +// The UADDLV v2i32 gets mapped to UADDLP. +def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>; //------------------------------------------------------------------------------ // AdvSIMD modified immediate instructions @@ -8005,15 +8189,15 @@ def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", - int_aarch64_neon_sqrshrn>; + BinOpFrag<(truncssat_s (AArch64srshri node:$LHS, node:$RHS))>>; defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", - int_aarch64_neon_sqrshrun>; + BinOpFrag<(truncssat_u (AArch64srshri node:$LHS, node:$RHS))>>; defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_aarch64_neon_sqshrn>; + BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_aarch64_neon_sqshrun>; + BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>; defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>; def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), @@ -8031,10 +8215,10 @@ defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", - int_aarch64_neon_uqrshrn>; + BinOpFrag<(truncusat_u (AArch64urshri node:$LHS, node:$RHS))>>; defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_aarch64_neon_uqshrn>; + BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>; defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", TriOpFrag<(add node:$LHS, @@ -8129,6 +8313,20 @@ def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(shl (v8i16 (zext (v8i8 V64:$Rm))), (v8i16 (AArch64dup (i32 imm32_0_7:$size)))), + (USHLLv8i8_shift V64:$Rm, (i32 imm32_0_7:$size))>; +def : Pat<(shl (v4i32 (zext (v4i16 V64:$Rm))), (v4i32 (AArch64dup (i32 imm32_0_15:$size)))), + (USHLLv4i16_shift V64:$Rm, (i32 imm32_0_15:$size))>; +def : Pat<(shl (v2i64 (zext (v2i32 V64:$Rm))), (v2i64 (AArch64dup (i64 imm0_31:$size)))), + (USHLLv2i32_shift V64:$Rm, (trunc_imm imm0_31:$size))>; + +def : Pat<(shl (v8i16 (sext (v8i8 V64:$Rm))), (v8i16 (AArch64dup (i32 imm32_0_7:$size)))), + (SSHLLv8i8_shift V64:$Rm, (i32 imm32_0_7:$size))>; +def : Pat<(shl (v4i32 (sext (v4i16 V64:$Rm))), (v4i32 (AArch64dup (i32 imm32_0_15:$size)))), + (SSHLLv4i16_shift V64:$Rm, (i32 imm32_0_15:$size))>; +def : Pat<(shl (v2i64 (sext (v2i32 V64:$Rm))), (v2i64 (AArch64dup (i64 imm0_31:$size)))), + (SSHLLv2i32_shift V64:$Rm, (trunc_imm imm0_31:$size))>; + // Vector sign and zero extensions are implemented with SSHLL and USSHLL. // Anyexts are implemented as zexts. 
def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; @@ -8140,8 +8338,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))> def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Vector bf16 -> fp32 is implemented morally as a zext + shift. -def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>; // Also match an extend from the upper half of a 128 bit source register. def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))), (USHLLv16i8_shift V128:$Rn, (i32 0))>; @@ -8550,7 +8746,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH let Predicates = [HasNEON] in { class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy, SDPatternOperator ExtLoad, Instruction LD1> - : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))), + : Pat<(ResultTy (vec_ins_or_scal_vec (i32 (ExtLoad GPR64sp:$Rn)))), (ResultTy (EXTRACT_SUBREG (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>; @@ -8983,11 +9179,11 @@ def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), +def : Pat<(v1i64 (vec_ins_or_scal_vec GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), +def : Pat<(v1f64 (vec_ins_or_scal_vec GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; +def : Pat<(v1f64 (vec_ins_or_scal_vec (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; @@ -9587,6 +9783,18 @@ def : Pat<(v16i8 (add (AArch64uzp1 (v16i8 FPR128:$Rn), (v16i8 FPR128:$Rm)), (AArch64uzp2 (v16i8 FPR128:$Rn), (v16i8 FPR128:$Rm)))), (v16i8 (ADDPv16i8 $Rn, $Rm))>; +def : Pat<(v2i32 (add (AArch64zip1 (extract_subvector (v4i32 FPR128:$Rn), (i64 0)), + (extract_subvector (v4i32 FPR128:$Rn), (i64 2))), + (AArch64zip2 (extract_subvector (v4i32 FPR128:$Rn), (i64 0)), + (extract_subvector (v4i32 FPR128:$Rn), (i64 2))))), + (EXTRACT_SUBREG (ADDPv4i32 $Rn, $Rn), dsub)>; +def : Pat<(v4i16 (add (trunc (v4i32 (bitconvert FPR128:$Rn))), + (extract_subvector (AArch64uzp2 (v8i16 FPR128:$Rn), undef), (i64 0)))), + (EXTRACT_SUBREG (ADDPv8i16 $Rn, $Rn), dsub)>; +def : Pat<(v8i8 (add (trunc (v8i16 (bitconvert FPR128:$Rn))), + (extract_subvector (AArch64uzp2 (v16i8 FPR128:$Rn), undef), (i64 0)))), + (EXTRACT_SUBREG (ADDPv16i8 $Rn, $Rn), dsub)>; + def : Pat<(v2f64 (fadd (AArch64zip1 (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm)), (AArch64zip2 (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm)))), (v2f64 (FADDPv2f64 $Rn, $Rm))>; @@ -9689,6 +9897,7 @@ def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; +let Size = 8 in def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>; def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>; @@ -9826,8 +10035,10 @@ foreach i = 0-7 in { } let Predicates = [HasLS64] in { + let mayLoad = 1 in def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn), (outs 
GPR64x8:$Rt)>; + let mayStore = 1 in def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn), (outs)>; def ST64BV: Store64BV<0b011, "st64bv">; @@ -9872,14 +10083,6 @@ let Predicates = [HasMOPS, HasMTE] in { } } -// MOPS Node operands: 0: Dst, 1: Src or Value, 2: Size, 3: Chain -// MOPS Node results: 0: Dst writeback, 1: Size writeback, 2: Chain -def SDT_AArch64mops : SDTypeProfile<2, 3, [ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2> ]>; -def AArch64mops_memset : SDNode<"AArch64ISD::MOPS_MEMSET", SDT_AArch64mops>; -def AArch64mops_memset_tagging : SDNode<"AArch64ISD::MOPS_MEMSET_TAGGING", SDT_AArch64mops>; -def AArch64mops_memcopy : SDNode<"AArch64ISD::MOPS_MEMCOPY", SDT_AArch64mops>; -def AArch64mops_memmove : SDNode<"AArch64ISD::MOPS_MEMMOVE", SDT_AArch64mops>; - // MOPS operations always contain three 4-byte instructions let Predicates = [HasMOPS], Defs = [NZCV], Size = 12, mayStore = 1 in { let mayLoad = 1 in { @@ -10122,44 +10325,60 @@ let Predicates = [HasD128] in { //===----------------------------===// let Predicates = [HasFP8] in { - defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">; - defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">; - defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">; - defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">; - defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">; - defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">; - defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>; + defm F1CVTL : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>; + defm F2CVTL : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>; + defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>; + defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>; + defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>; + defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>; + defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>; } // End let Predicates = [HasFP8] -let Predicates = [HasFAMINMAX] in { - defm FAMAX : SIMDThreeSameVectorFP<0b0, 0b1, 0b011, "famax", null_frag>; - defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", null_frag>; -} // End let Predicates = [HasFAMAXMIN] +// fminimum(abs(a), abs(b)) -> famin(a, b) +// fminnum[nnan](abs(a), abs(b)) -> famin(a, b) +def AArch64famin : PatFrags<(ops node:$Rn, node:$Rm), + [(int_aarch64_neon_famin node:$Rn, node:$Rm), + (fminimum (fabs node:$Rn), (fabs node:$Rm)), + (fminnum_nnan (fabs node:$Rn), (fabs node:$Rm))]>; + +// fmaximum(abs(a), abs(b)) -> famax(a, b) +// fmaxnum[nnan](abs(a), abs(b)) -> famax(a, b) +def AArch64famax : PatFrags<(ops node:$Rn, node:$Rm), + [(int_aarch64_neon_famax node:$Rn, node:$Rm), + (fmaximum (fabs node:$Rn), (fabs node:$Rm)), + (fmaxnum_nnan (fabs node:$Rn), (fabs node:$Rm))]>; + +let Predicates = [HasNEON, HasFAMINMAX] in { + defm FAMAX : SIMDThreeSameVectorFP<0b0, 0b1, 0b011, "famax", AArch64famax>; + defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", AArch64famin>; +} // End let Predicates = [HasNEON, HasFAMINMAX] let Predicates = [HasFP8FMA] in { - defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb">; - defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt">; - defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb">; - defm FMLALLBTlane : SIMDThreeSameVectorMLALIndex<0b0, 0b01, "fmlallbt">; - defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb">; 
- defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt">; - - defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb">; - defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt">; - defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb">; - defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt">; - defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb">; - defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">; + defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb_lane>; + defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt_lane>; + defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb_lane>; + defm FMLALLBTlane : SIMDThreeSameVectorMLALIndex<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt_lane>; + defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb_lane>; + defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt_lane>; +} + +let Predicates = [HasFP8FMA], Uses = [FPMR, FPCR], mayLoad = 1 in { + defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb>; + defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt>; + defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb>; + defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt>; + defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb>; + defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt>; } // End let Predicates = [HasFP8FMA] let Predicates = [HasFP8DOT2] in { - defm FDOTlane : SIMDThreeSameVectorFP8DOT2Index<"fdot">; - defm FDOT : SIMDThreeSameVectorDOT2<"fdot">; + defm FDOTlane : SIMD_FP8_Dot2_Index<"fdot", int_aarch64_neon_fp8_fdot2_lane>; + defm FDOT : SIMD_FP8_Dot2<"fdot", int_aarch64_neon_fp8_fdot2>; } // End let Predicates = [HasFP8DOT2] let Predicates = [HasFP8DOT4] in { - defm FDOTlane : SIMDThreeSameVectorFP8DOT4Index<"fdot">; - defm FDOT : SIMDThreeSameVectorDOT4<"fdot">; + defm FDOTlane : SIMD_FP8_Dot4_Index<"fdot", int_aarch64_neon_fp8_fdot4_lane>; + defm FDOT : SIMD_FP8_Dot4<"fdot", int_aarch64_neon_fp8_fdot4>; } // End let Predicates = [HasFP8DOT4] //===----------------------------------------------------------------------===// @@ -10203,9 +10422,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst> let Predicates = [HasBF16] in def : Pat<(InOp (v8bf16 V128:$Rn)), (v8bf16 (BFCVTN2 - (v8bf16 (BFCVTN - (v4f32 (OutInst - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))), + (INSERT_SUBREG (IMPLICIT_DEF), + (v4bf16 (BFCVTN + (v4f32 (OutInst + (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))), + dsub), (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>; let Predicates = [HasNoBF16] in @@ -10240,10 +10461,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst let Predicates = [HasBF16] in def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)), (v8bf16 (BFCVTN2 - (v8bf16 (BFCVTN - (v4f32 (OutInst - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))), - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))), + (INSERT_SUBREG (IMPLICIT_DEF), + (v4bf16 (BFCVTN + (v4f32 (OutInst + (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))), + (v4f32 (SHLLv4i16 (v4i16 
(EXTRACT_SUBREG V128:$Rm, dsub)))))))), + dsub), (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)), (v4f32 (SHLLv8i16 V128:$Rm))))))>; @@ -10262,6 +10485,132 @@ defm : PromoteBinaryv8f16Tov4f32<any_fdiv, FDIVv4f32>; defm : PromoteBinaryv8f16Tov4f32<any_fmul, FMULv4f32>; defm : PromoteBinaryv8f16Tov4f32<any_fsub, FSUBv4f32>; +let Predicates = [HasCMPBR] in { + defm CBGT : CmpBranchRegister<0b000, "cbgt">; + defm CBGE : CmpBranchRegister<0b001, "cbge">; + defm CBHI : CmpBranchRegister<0b010, "cbhi">; + defm CBHS : CmpBranchRegister<0b011, "cbhs">; + defm CBEQ : CmpBranchRegister<0b110, "cbeq">; + defm CBNE : CmpBranchRegister<0b111, "cbne">; + + def CBHGTWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b000, 0b11, "cbhgt">; + def CBHGEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b001, 0b11, "cbhge">; + def CBHHIWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b010, 0b11, "cbhhi">; + def CBHHSWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b011, 0b11, "cbhhs">; + def CBHEQWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b110, 0b11, "cbheq">; + def CBHNEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b111, 0b11, "cbhne">; + + def CBBGTWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b000, 0b10, "cbbgt">; + def CBBGEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b001, 0b10, "cbbge">; + def CBBHIWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b010, 0b10, "cbbhi">; + def CBBHSWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b011, 0b10, "cbbhs">; + def CBBEQWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b110, 0b10, "cbbeq">; + def CBBNEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b111, 0b10, "cbbne">; + + defm CBGT : CmpBranchImmediate<0b000, "uimm6", "cbgt">; + defm CBLT : CmpBranchImmediate<0b001, "uimm6", "cblt">; + defm CBHI : CmpBranchImmediate<0b010, "uimm6", "cbhi">; + defm CBLO : CmpBranchImmediate<0b011, "uimm6", "cblo">; + defm CBEQ : CmpBranchImmediate<0b110, "uimm6", "cbeq">; + defm CBNE : CmpBranchImmediate<0b111, "uimm6", "cbne">; + + defm : CmpBranchImmediateAlias<"cbge", "CBGT", "uimm6p1">; + defm : CmpBranchImmediateAlias<"cbhs", "CBHI", "uimm6p1">; + defm : CmpBranchImmediateAlias<"cble", "CBLT", "uimm6m1">; + defm : CmpBranchImmediateAlias<"cbls", "CBLO", "uimm6m1">; + + defm : CmpBranchRegisterAlias<"cble", "CBGE">; + defm : CmpBranchRegisterAlias<"cblo", "CBHI">; + defm : CmpBranchRegisterAlias<"cbls", "CBHS">; + defm : CmpBranchRegisterAlias<"cblt", "CBGT">; + + defm : CmpBranchWRegisterAlias<"cbble", "CBBGE">; + defm : CmpBranchWRegisterAlias<"cbblo", "CBBHI">; + defm : CmpBranchWRegisterAlias<"cbbls", "CBBHS">; + defm : CmpBranchWRegisterAlias<"cbblt", "CBBGT">; + + defm : CmpBranchWRegisterAlias<"cbhle", "CBHGE">; + defm : CmpBranchWRegisterAlias<"cbhlo", "CBHHI">; + defm : CmpBranchWRegisterAlias<"cbhls", "CBHHS">; + defm : CmpBranchWRegisterAlias<"cbhlt", "CBHGT">; +} // HasCMPBR + + +//===-----------------------------------------------------===// +// Atomic floating-point in-memory instructions (FEAT_LSFE) +//===-----------------------------------------------------===// + +let Predicates = [HasLSFE] in { + // Floating-point Atomic Load + defm LDFADDA : AtomicFPLoad<0b10, 0b000, "ldfadda">; + defm LDFADDAL : AtomicFPLoad<0b11, 0b000, "ldfaddal">; + defm LDFADD : AtomicFPLoad<0b00, 0b000, "ldfadd">; + defm LDFADDL : AtomicFPLoad<0b01, 0b000, "ldfaddl">; + defm LDFMAXA : AtomicFPLoad<0b10, 0b100, "ldfmaxa">; + defm LDFMAXAL : AtomicFPLoad<0b11, 0b100, "ldfmaxal">; + defm LDFMAX : AtomicFPLoad<0b00, 0b100, "ldfmax">; + defm LDFMAXL : AtomicFPLoad<0b01, 0b100, "ldfmaxl">; + defm LDFMINA : AtomicFPLoad<0b10, 0b101, "ldfmina">; + defm LDFMINAL : 
AtomicFPLoad<0b11, 0b101, "ldfminal">; + defm LDFMIN : AtomicFPLoad<0b00, 0b101, "ldfmin">; + defm LDFMINL : AtomicFPLoad<0b01, 0b101, "ldfminl">; + defm LDFMAXNMA : AtomicFPLoad<0b10, 0b110, "ldfmaxnma">; + defm LDFMAXNMAL : AtomicFPLoad<0b11, 0b110, "ldfmaxnmal">; + defm LDFMAXNM : AtomicFPLoad<0b00, 0b110, "ldfmaxnm">; + defm LDFMAXNML : AtomicFPLoad<0b01, 0b110, "ldfmaxnml">; + defm LDFMINNMA : AtomicFPLoad<0b10, 0b111, "ldfminnma">; + defm LDFMINNMAL : AtomicFPLoad<0b11, 0b111, "ldfminnmal">; + defm LDFMINMN : AtomicFPLoad<0b00, 0b111, "ldfminnm">; + defm LDFMINNML : AtomicFPLoad<0b01, 0b111, "ldfminnml">; + // BFloat16 + def LDBFADDA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b000, "ldbfadda">; + def LDBFADDAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b000, "ldbfaddal">; + def LDBFADD : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b000, "ldbfadd">; + def LDBFADDL : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b000, "ldbfaddl">; + def LDBFMAXA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b100, "ldbfmaxa">; + def LDBFMAXAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b100, "ldbfmaxal">; + def LDBFMAX : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b100, "ldbfmax">; + def LDBFMAXL : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b100, "ldbfmaxl">; + def LDBFMINA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b101, "ldbfmina">; + def LDBFMINAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b101, "ldbfminal">; + def LDBFMIN : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b101, "ldbfmin">; + def LDBFMINL : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b101, "ldbfminl">; + def LDBFMAXNMA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b110, "ldbfmaxnma">; + def LDBFMAXNMAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b110, "ldbfmaxnmal">; + def LDBFMAXNM : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b110, "ldbfmaxnm">; + def LDBFMAXNML : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b110, "ldbfmaxnml">; + def LDBFMINNMA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b111, "ldbfminnma">; + def LDBFMINNMAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b111, "ldbfminnmal">; + def LDBFMINNM : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b111, "ldbfminnm">; + def LDBFMINNML : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b111, "ldbfminnml">; + + // Floating-point Atomic Store + defm STFADD : AtomicFPStore<0b0, 0b000, "stfadd">; + defm STFADDL : AtomicFPStore<0b1, 0b000, "stfaddl">; + defm STFMAX : AtomicFPStore<0b0, 0b100, "stfmax">; + defm STFMAXL : AtomicFPStore<0b1, 0b100, "stfmaxl">; + defm STFMIN : AtomicFPStore<0b0, 0b101, "stfmin">; + defm STFMINL : AtomicFPStore<0b1, 0b101, "stfminl">; + defm STFMAXNM : AtomicFPStore<0b0, 0b110, "stfmaxnm">; + defm STFMAXNML : AtomicFPStore<0b1, 0b110, "stfmaxnml">; + defm STFMINNM : AtomicFPStore<0b0, 0b111, "stfminnm">; + defm STFMINNML : AtomicFPStore<0b1, 0b111, "stfminnml">; + // BFloat16 + def STBFADD : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b000, "stbfadd">; + def STBFADDL : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b000, "stbfaddl">; + def STBFMAX : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b100, "stbfmax">; + def STBFMAXL : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b100, "stbfmaxl">; + def STBFMIN : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b101, "stbfmin">; + def STBFMINL : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b101, "stbfminl">; + def STBFMAXNM : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b110, "stbfmaxnm">; + def STBFMAXNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b110, "stbfmaxnml">; + def STBFMINNM : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b111, "stbfminnm">; + def STBFMINNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b111, "stbfminnml">; +} + +let Uses = [FPMR, FPCR] in +defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">; 
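Editor's note (not from the patch): the FEAT_LSFE definitions above are assembler-level at this point in the patch (no ISel patterns are attached), and the mnemonic suffixes follow the usual atomic naming, with "A" adding acquire and "L" adding release ordering. For orientation only, the operation an LDFADD-style instruction describes is an in-memory atomic floating-point add, roughly:

    #include <atomic>
    // Sketch only (C++20): an atomic fetch-add on a float in memory,
    // returning the value the memory held before the update.
    float atomic_fadd(std::atomic<float> &mem, float v) {
      return mem.fetch_add(v);
    }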
+ include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" include "AArch64SMEInstrInfo.td" |
