path: root/llvm/lib/Target/AArch64/AArch64InstrInfo.td
Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64InstrInfo.td')
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td  1379
1 file changed, 864 insertions, 515 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1053ba924276..d112d4f10e47 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -143,20 +143,20 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->isSVEAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">;
+def HasSVEB16B16 : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEB16B16()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVEB16B16), "sve-b16b16">;
def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">;
def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">;
-def HasSVE2AES : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2AES()">,
- AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">;
+def HasSVEAES : Predicate<"Subtarget->hasSVEAES()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">;
def HasSVE2SM4 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">;
def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">,
AssemblerPredicateWithAll<(all_of FeatureSVE2SHA3), "sve2-sha3">;
-def HasSVE2BitPerm : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicateWithAll<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
-def HasB16B16 : Predicate<"Subtarget->hasB16B16()">,
- AssemblerPredicateWithAll<(all_of FeatureB16B16), "b16b16">;
+def HasSVEBitPerm : Predicate<"Subtarget->hasSVEBitPerm()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVEBitPerm), "sve-bitperm">;
def HasSMEandIsNonStreamingSafe
: Predicate<"Subtarget->hasSME()">,
AssemblerPredicateWithAll<(all_of FeatureSME), "sme">;
@@ -170,6 +170,8 @@ def HasSMEFA64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF
AssemblerPredicateWithAll<(all_of FeatureSMEFA64), "sme-fa64">;
def HasSMEI16I64 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEI16I64()">,
AssemblerPredicateWithAll<(all_of FeatureSMEI16I64), "sme-i16i64">;
+def HasSMEB16B16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEB16B16()">,
+ AssemblerPredicateWithAll<(all_of FeatureSMEB16B16), "sme-b16b16">;
def HasSME2andIsNonStreamingSafe
: Predicate<"Subtarget->hasSME2()">,
AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">;
@@ -204,47 +206,104 @@ def HasSSVE_FP8DOT4 : Predicate<"Subtarget->hasSSVE_FP8DOT4() || "
"ssve-fp8dot4 or (sve2 and fp8dot4)">;
def HasLUT : Predicate<"Subtarget->hasLUT()">,
AssemblerPredicateWithAll<(all_of FeatureLUT), "lut">;
-def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">,
+def HasSME_LUTv2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME_LUTv2()">,
AssemblerPredicateWithAll<(all_of FeatureSME_LUTv2), "sme-lutv2">;
def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F16()">,
AssemblerPredicateWithAll<(all_of FeatureSMEF8F16), "sme-f8f16">;
def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">,
AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">;
+def HasSME_MOP4 : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_MOP4())">,
+ AssemblerPredicateWithAll<(all_of FeatureSME_MOP4), "sme-mop4">;
+def HasSME_TMOP : Predicate<"(Subtarget->isStreaming() && Subtarget->hasSME_TMOP())">,
+ AssemblerPredicateWithAll<(all_of FeatureSME_TMOP), "sme-tmop">;
+
+def HasCMPBR : Predicate<"Subtarget->hasCMPBR()">,
+ AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">;
+def HasF8F32MM : Predicate<"Subtarget->hasF8F32MM()">,
+ AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">;
+def HasF8F16MM : Predicate<"Subtarget->hasF8F16MM()">,
+ AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">;
+def HasFPRCVT : Predicate<"Subtarget->hasFPRCVT()">,
+ AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">;
+def HasLSFE : Predicate<"Subtarget->hasLSFE()">,
+ AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">;
+def HasSME2p2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">,
+ AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">;
+def HasSVEAES2 : Predicate<"Subtarget->hasSVEAES2()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">;
+def HasSVEBFSCALE : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">;
+def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">;
+def HasPCDPHINT : Predicate<"Subtarget->hasPCDPHINT()">,
+ AssemblerPredicateWithAll<(all_of FeaturePCDPHINT), "pcdphint">;
+def HasLSUI : Predicate<"Subtarget->hasLSUI()">,
+ AssemblerPredicateWithAll<(all_of FeatureLSUI), "lsui">;
+def HasOCCMO : Predicate<"Subtarget->hasOCCMO()">,
+ AssemblerPredicateWithAll<(all_of FeatureOCCMO), "occmo">;
// A subset of SVE(2) instructions are legal in Streaming SVE execution mode,
// they should be enabled if either has been specified.
-def HasSVEorSME
+def HasSVE_or_SME
: Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME),
"sve or sme">;
-def HasSVE2orSME
+def HasNonStreamingSVE_or_SME2p2
+ : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||"
+ "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME2p2),
+ "sve or sme2p2">;
+def HasSVE2_or_SME
: Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME),
"sve2 or sme">;
-def HasSVE2orSME2
+def HasSVE2_or_SME2
: Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">,
AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2),
"sve2 or sme2">;
-def HasSVE2p1_or_HasSME
+def HasNonStreamingSVE2_or_SSVE_AES
+ : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||"
+ "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES), "sve2 or ssve-aes">;
+def HasSVE2p1_or_SME
: Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">,
AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">;
-def HasSVE2p1_or_HasSME2
+def HasSVE2p1_or_SME2
: Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2())">,
AssemblerPredicateWithAll<(any_of FeatureSME2, FeatureSVE2p1), "sme2 or sve2p1">;
-def HasSVE2p1_or_HasSME2p1
+def HasSVE2p1_or_SME2p1
: Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">,
AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">;
-
-def HasSMEF16F16orSMEF8F16
+def HasSVE2p2_or_SME2p2
+ : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">,
+ AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">;
+def HasNonStreamingSVE2p1_or_SSVE_AES
+ : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()) ||"
+ "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE2p1, FeatureSSVE_AES), "sve2p1 or ssve-aes">;
+def HasSMEF16F16_or_SMEF8F16
: Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">,
AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16),
"sme-f16f16 or sme-f8f16">;
+def HasNonStreamingSVE2p2_or_SME2p2
+ : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||"
+ "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE2p2, FeatureSME2p2),
+ "sme2p2 or sve2p2">;
+def HasNonStreamingSVE2_or_SSVE_BitPerm
+ : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||"
+ "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_BitPerm())">,
+ AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_BitPerm), "sve2 or ssve-bitperm">;
// A subset of NEON instructions are legal in Streaming SVE execution mode,
// so don't need the additional check for 'isNeonAvailable'.
def HasNEONandIsStreamingSafe
: Predicate<"Subtarget->hasNEON()">,
AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
+// A subset of NEON instructions are legal in Streaming SVE mode only with +sme2p2.
+def HasNEONandIsSME2p2StreamingSafe
+ : Predicate<"Subtarget->isNeonAvailable() || (Subtarget->hasNEON() && Subtarget->hasSME2p2())">,
+ AssemblerPredicateWithAll<(any_of FeatureNEON), "neon">;
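A minimal sketch of how this predicate is consumed: it simply guards ordinary selection patterns through a let Predicates block, mirroring the int <-> fp round-trip patterns that this same patch switches over further down (no new definitions are implied here):

    let Predicates = [HasNEONandIsSME2p2StreamingSafe] in
    def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
              (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;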
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
AssemblerPredicateWithAll<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
@@ -330,6 +389,8 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
+def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">;
+
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
SDTCisInt<1>]>>;
@@ -387,9 +448,9 @@ def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
SDTCisInt<3>,
SDTCisInt<4>,
SDTCisVT<5, i32>]>;
-def SDT_AArch64FCmp : SDTypeProfile<0, 2,
- [SDTCisFP<0>,
- SDTCisSameAs<0, 1>]>;
+def SDT_AArch64FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisFP<1>,
+ SDTCisSameAs<2, 1>]>;
def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
@@ -756,6 +817,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
+def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;
+
def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
@@ -828,9 +891,11 @@ def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
SDT_AArch64TLSDescCallSeq,
- [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
- SDNPVariadic]>;
+ [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
+def AArch64tlsdesc_auth_callseq : SDNode<"AArch64ISD::TLSDESC_AUTH_CALLSEQ",
+ SDT_AArch64TLSDescCallSeq,
+ [SDNPOutGlue, SDNPHasChain, SDNPVariadic]>;
def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
SDT_AArch64WrapperLarge>;
@@ -853,6 +918,7 @@ def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;
def AArch64sdot : SDNode<"AArch64ISD::SDOT", SDT_AArch64Dot>;
def AArch64udot : SDNode<"AArch64ISD::UDOT", SDT_AArch64Dot>;
+def AArch64usdot : SDNode<"AArch64ISD::USDOT", SDT_AArch64Dot>;
def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
@@ -938,8 +1004,10 @@ def AArch64probedalloca
[SDNPHasChain, SDNPMayStore]>;
def AArch64mrs : SDNode<"AArch64ISD::MRS",
- SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
- [SDNPHasChain, SDNPOutGlue]>;
+ SDTypeProfile<2, 1, [SDTCisVT<0, i64>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>]>,
+ [SDNPHasChain]>;
def SD_AArch64rshrnb : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<2>]>;
def AArch64rshrnb : SDNode<"AArch64ISD::RSHRNB_I", SD_AArch64rshrnb>;
@@ -971,6 +1039,17 @@ def smullwithsignbits : PatFrag<(ops node:$l, node:$r), (mul node:$l, node:$r),
CurDAG->ComputeNumSignBits(N->getOperand(1)) > 32;
}]>;
+// Match "nnan" flagged calls to fminnum and fmmaxnum. Then semantically equivalent
+// to fmaximum/fminimum.
+def fmaxnum_nnan : PatFrag<(ops node:$Rn, node:$Rm),
+ (fmaxnum node:$Rn, node:$Rm), [{
+ return N->getFlags().hasNoNaNs();
+ }]>;
+def fminnum_nnan : PatFrag<(ops node:$Rn, node:$Rm),
+ (fminnum node:$Rn, node:$Rm), [{
+ return N->getFlags().hasNoNaNs();
+ }]>;
+
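A minimal sketch of how such a PatFrag slots into instruction selection; the mapping of an nnan-flagged fmaxnum onto the fmaximum-semantics FMAXSrr instruction below is illustrative only and is not part of this hunk:

    def : Pat<(f32 (fmaxnum_nnan (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
              (FMAXSrr FPR32:$Rn, FPR32:$Rm)>;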
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -1062,7 +1141,7 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs),
[(AArch64probedalloca GPR64common:$target)]>,
Sched<[]>;
-} // Defs = [SP, NZCV], Uses = [SP] in
+} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1
let isReMaterializable = 1, isCodeGenOnly = 1 in {
@@ -1215,6 +1294,11 @@ def : InstAlias<"sevl", (HINT 0b101)>;
def : InstAlias<"dgh", (HINT 0b110)>;
def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
def : InstAlias<"csdb", (HINT 20)>;
+
+let Predicates = [HasPCDPHINT] in {
+ def STSHH: STSHHI;
+}
+
// In order to be able to write readable assembly, LLVM should accept assembly
// inputs that use Branch Target Identification mnemonics, even with BTI disabled.
// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
@@ -1372,8 +1456,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
def BFCVTN : SIMD_BFCVTN;
def BFCVTN2 : SIMD_BFCVTN2;
-def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
- (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
+def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
+ (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
// Vector-scalar BFDOT:
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1395,8 +1479,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
-// Round FP32 to BF16.
-def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
}
// ARMv8.6A AArch64 matrix multiplication
@@ -1404,8 +1486,8 @@ let Predicates = [HasMatMulInt8] in {
def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
-defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
-defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
+defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", AArch64usdot>;
+defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", AArch64usdot>;
// sudot lane has a pattern where usdot is expected (there is no sudot).
// The second operand is used in the dup operation to repeat the indexed
@@ -1415,9 +1497,9 @@ class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
ValueType AccumType, ValueType InputType>
: BaseSIMDThreeSameVectorIndexS<Q, 0, 0b00, 0b1111, "sudot", dst_kind,
lhs_kind, rhs_kind, RegType, AccumType,
- InputType, null_frag> {
+ InputType, VectorIndexS, null_frag> {
let Pattern = [(set (AccumType RegType:$dst),
- (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
+ (AccumType (AArch64usdot (AccumType RegType:$Rd),
(InputType (bitconvert (AccumType
(AArch64duplane32 (v4i32 V128:$Rm),
VectorIndexS:$idx)))),
@@ -1770,28 +1852,28 @@ let Predicates = [HasPAuth] in {
// materialization here), in part because they're handled in a safer way by
// the kernel, notably on Darwin.
def BLRA : Pseudo<(outs), (ins GPR64noip:$Rn, i32imm:$Key, i64imm:$Disc,
- GPR64noip:$AddrDisc),
+ GPR64:$AddrDisc),
[(AArch64authcall GPR64noip:$Rn, timm:$Key, timm:$Disc,
- GPR64noip:$AddrDisc)]>, Sched<[]> {
+ GPR64:$AddrDisc)]>, Sched<[]> {
let isCodeGenOnly = 1;
let hasSideEffects = 1;
let mayStore = 0;
let mayLoad = 0;
let isCall = 1;
let Size = 12; // 4 fixed + 8 variable, to compute discriminator.
- let Defs = [X17,LR];
+ let Defs = [X16,X17,LR];
let Uses = [SP];
}
def BLRA_RVMARKER : Pseudo<
(outs), (ins i64imm:$rvfunc, GPR64noip:$Rn, i32imm:$Key, i64imm:$Disc,
- GPR64noip:$AddrDisc),
+ GPR64:$AddrDisc),
[(AArch64authcall_rvmarker tglobaladdr:$rvfunc,
GPR64noip:$Rn, timm:$Key, timm:$Disc,
- GPR64noip:$AddrDisc)]>, Sched<[]> {
+ GPR64:$AddrDisc)]>, Sched<[]> {
let isCodeGenOnly = 1;
let isCall = 1;
- let Defs = [X17,LR];
+ let Defs = [X16,X17,LR];
let Uses = [SP];
}
@@ -1872,8 +1954,15 @@ let Predicates = [HasPAuth] in {
Sched<[WriteI, ReadI]> {
let isReMaterializable = 1;
let isCodeGenOnly = 1;
- let Size = 40; // 12 fixed + 28 variable, for pointer offset, and discriminator
- let Defs = [X16,X17];
+ let Size = 68; // 12 fixed + 56 variable, for pointer offset, discriminator and
+ // ELF signed GOT signed pointer authentication (if no FPAC)
+ let Defs = [X16,X17,NZCV];
+ }
+
+ def LOADgotAUTH : Pseudo<(outs GPR64common:$dst), (ins i64imm:$addr), []>,
+ Sched<[WriteI, ReadI]> {
+ let Defs = [X16,X17,NZCV];
+ let Size = 44;
}
// Load a signed global address from a special $auth_ptr$ stub slot.
@@ -1887,30 +1976,36 @@ let Predicates = [HasPAuth] in {
}
// Size 16: 4 fixed + 8 variable, to compute discriminator.
+ // The size returned by getInstSizeInBytes() is incremented according
+ // to the variant of LR check.
+ // As the check requires either x16 or x17 as a scratch register and
+ // authenticated tail call instructions have two register operands,
+ // make sure at least one register is usable as a scratch one - for that
+ // purpose, use tcGPRnotx16x17 register class for one of the operands.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Size = 16,
- Uses = [SP] in {
+ Defs = [X16,X17], Uses = [SP] in {
def AUTH_TCRETURN
- : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff, i32imm:$Key,
+ : Pseudo<(outs), (ins tcGPRnotx16x17:$dst, i32imm:$FPDiff, i32imm:$Key,
i64imm:$Disc, tcGPR64:$AddrDisc),
[]>, Sched<[WriteBrReg]>;
def AUTH_TCRETURN_BTI
: Pseudo<(outs), (ins tcGPRx16x17:$dst, i32imm:$FPDiff, i32imm:$Key,
- i64imm:$Disc, tcGPR64:$AddrDisc),
+ i64imm:$Disc, tcGPRnotx16x17:$AddrDisc),
[]>, Sched<[WriteBrReg]>;
}
let Predicates = [TailCallAny] in
- def : Pat<(AArch64authtcret tcGPR64:$dst, (i32 timm:$FPDiff), (i32 timm:$Key),
+ def : Pat<(AArch64authtcret tcGPRnotx16x17:$dst, (i32 timm:$FPDiff), (i32 timm:$Key),
(i64 timm:$Disc), tcGPR64:$AddrDisc),
- (AUTH_TCRETURN tcGPR64:$dst, imm:$FPDiff, imm:$Key, imm:$Disc,
+ (AUTH_TCRETURN tcGPRnotx16x17:$dst, imm:$FPDiff, imm:$Key, imm:$Disc,
tcGPR64:$AddrDisc)>;
let Predicates = [TailCallX16X17] in
def : Pat<(AArch64authtcret tcGPRx16x17:$dst, (i32 timm:$FPDiff),
(i32 timm:$Key), (i64 timm:$Disc),
- tcGPR64:$AddrDisc),
+ tcGPRnotx16x17:$AddrDisc),
(AUTH_TCRETURN_BTI tcGPRx16x17:$dst, imm:$FPDiff, imm:$Key,
- imm:$Disc, tcGPR64:$AddrDisc)>;
+ imm:$Disc, tcGPRnotx16x17:$AddrDisc)>;
}
// v9.5-A pointer authentication extensions
@@ -1935,6 +2030,8 @@ let Predicates = [HasPAuthLR] in {
// opcode2, opcode, asm
def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppcr">;
def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppcr">;
+ }
+ let Defs = [X17], Uses = [X15, X16, X17] in {
// opcode2, opcode, asm
def PACIA171615 : SignAuthFixedRegs<0b00001, 0b100010, "pacia171615">;
def PACIB171615 : SignAuthFixedRegs<0b00001, 0b100011, "pacib171615">;
@@ -2080,6 +2177,12 @@ def MSR_FPSR : Pseudo<(outs), (ins GPR64:$val),
PseudoInstExpansion<(MSR 0xda21, GPR64:$val)>,
Sched<[WriteSys]>;
+let Defs = [FPMR] in
+def MSR_FPMR : Pseudo<(outs), (ins GPR64:$val),
+ [(int_aarch64_set_fpmr i64:$val)]>,
+ PseudoInstExpansion<(MSR 0xda22, GPR64:$val)>,
+ Sched<[WriteSys]>;
+
// Generic system instructions
def SYSxt : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;
@@ -2212,7 +2315,7 @@ def s64imm_32bit : ImmLeaf<i64, [{
}]>;
def trunc_imm : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
+ return CurDAG->getTargetConstant((uint32_t)N->getZExtValue(), SDLoc(N), MVT::i32);
}]>;
def gi_trunc_imm : GICustomOperandRenderer<"renderTruncImm">,
@@ -2552,12 +2655,74 @@ defm CASPA : CompareAndSwapPair<1, 0, "a">;
defm CASPL : CompareAndSwapPair<0, 1, "l">;
defm CASPAL : CompareAndSwapPair<1, 1, "al">;
+// v9.6-a atomic CAST
+let Predicates = [HasLSUI] in {
+defm CAST : CompareAndSwapUnprivileged<0b11, 0, 0, "">;
+defm CASLT : CompareAndSwapUnprivileged<0b11, 0, 1, "l">;
+defm CASAT : CompareAndSwapUnprivileged<0b11, 1, 0, "a">;
+defm CASALT : CompareAndSwapUnprivileged<0b11, 1, 1, "al">;
+
+def : MnemonicAlias<"cas", "cast">;
+def : MnemonicAlias<"casl", "caslt">;
+def : MnemonicAlias<"casa", "casat">;
+def : MnemonicAlias<"casal", "casalt">;
+
+// v9.6-a atomic CASPT
+defm CASPT : CompareAndSwapPairUnprivileged<0b01, 0, 0, "">;
+defm CASPLT : CompareAndSwapPairUnprivileged<0b01, 0, 1, "l">;
+defm CASPAT : CompareAndSwapPairUnprivileged<0b01, 1, 0, "a">;
+defm CASPALT : CompareAndSwapPairUnprivileged<0b01, 1, 1, "al">;
+
+def : MnemonicAlias<"casp", "caspt">;
+def : MnemonicAlias<"caspl", "casplt">;
+def : MnemonicAlias<"caspa", "caspat">;
+def : MnemonicAlias<"caspal", "caspalt">;
+}
+
// v8.1 atomic SWP
defm SWP : Swap<0, 0, "">;
defm SWPA : Swap<1, 0, "a">;
defm SWPL : Swap<0, 1, "l">;
defm SWPAL : Swap<1, 1, "al">;
+// v9.6-a atomic swap (FEAT_LSUI)
+let Predicates = [HasLSUI] in {
+ defm SWPT : SwapLSUI<0, 0, "">;
+ defm SWPTA : SwapLSUI<1, 0, "a">;
+ defm SWPTL : SwapLSUI<0, 1, "l">;
+ defm SWPTAL : SwapLSUI<1, 1, "al">;
+
+ def : MnemonicAlias<"swp", "swpt">;
+ def : MnemonicAlias<"swpa", "swpta">;
+ def : MnemonicAlias<"swpl", "swptl">;
+ def : MnemonicAlias<"swpal", "swptal">;
+}
+
+// v9.6-a unprivileged atomic LD<OP> (FEAT_LSUI)
+let Predicates = [HasLSUI] in {
+ defm LDTADD : LDOPregisterLSUI<0b000, "add", 0, 0, "">;
+ defm LDTADDA : LDOPregisterLSUI<0b000, "add", 1, 0, "a">;
+ defm LDTADDL : LDOPregisterLSUI<0b000, "add", 0, 1, "l">;
+ defm LDTADDAL : LDOPregisterLSUI<0b000, "add", 1, 1, "al">;
+
+ defm LDTCLR : LDOPregisterLSUI<0b001, "clr", 0, 0, "">;
+ defm LDTCLRA : LDOPregisterLSUI<0b001, "clr", 1, 0, "a">;
+ defm LDTCLRL : LDOPregisterLSUI<0b001, "clr", 0, 1, "l">;
+ defm LDTCLRAL : LDOPregisterLSUI<0b001, "clr", 1, 1, "al">;
+
+ defm LDTSET : LDOPregisterLSUI<0b011, "set", 0, 0, "">;
+ defm LDTSETA : LDOPregisterLSUI<0b011, "set", 1, 0, "a">;
+ defm LDTSETL : LDOPregisterLSUI<0b011, "set", 0, 1, "l">;
+ defm LDTSETAL : LDOPregisterLSUI<0b011, "set", 1, 1, "al">;
+
+ defm : STOPregisterLSUI<"sttadd","LDTADD">; // STTADDx
+ defm : STOPregisterLSUI<"sttclr","LDTCLR">; // STTCLRx
+ defm : STOPregisterLSUI<"sttset","LDTSET">; // STTSETx
+}
+
+// v9.6-a FEAT_RME_GPC3
+def APAS : APASI;
+
// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
defm LDADD : LDOPregister<0b000, "add", 0, 0, "">;
defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">;
@@ -2640,14 +2805,17 @@ def : Pat<(int_aarch64_ldg GPR64:$Rt, (am_indexeds9s128 GPR64sp:$Rn, simm9s16:$
def : InstAlias<"ldg $Rt, [$Rn]", (LDG GPR64:$Rt, GPR64sp:$Rn, 0), 1>;
+let mayLoad = 1 in
def LDGM : MemTagVector<1, "ldgm", "\t$Rt, [$Rn]",
(outs GPR64:$Rt), (ins GPR64sp:$Rn)>;
+let mayStore = 1 in {
def STGM : MemTagVector<0, "stgm", "\t$Rt, [$Rn]",
(outs), (ins GPR64:$Rt, GPR64sp:$Rn)>;
def STZGM : MemTagVector<0, "stzgm", "\t$Rt, [$Rn]",
(outs), (ins GPR64:$Rt, GPR64sp:$Rn)> {
let Inst{23} = 0;
}
+} // mayStore = 1
defm STG : MemTagStore<0b00, "stg">;
defm STZG : MemTagStore<0b01, "stzg">;
@@ -2827,6 +2995,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
+def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
+def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
+
def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
(and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
(REV16Xr GPR64:$Rn)>;
@@ -3157,8 +3328,16 @@ def TLSDESC_CALLSEQ
: Pseudo<(outs), (ins i64imm:$sym),
[(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
+let isCall = 1, Defs = [NZCV, LR, X0, X16], hasSideEffects = 1, Size = 16,
+ isCodeGenOnly = 1 in
+def TLSDESC_AUTH_CALLSEQ
+ : Pseudo<(outs), (ins i64imm:$sym),
+ [(AArch64tlsdesc_auth_callseq tglobaltlsaddr:$sym)]>,
+ Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
(TLSDESC_CALLSEQ texternalsym:$sym)>;
+def : Pat<(AArch64tlsdesc_auth_callseq texternalsym:$sym),
+ (TLSDESC_AUTH_CALLSEQ texternalsym:$sym)>;
//===----------------------------------------------------------------------===//
// Conditional branch (immediate) instruction.
@@ -3302,59 +3481,6 @@ defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
// Pre-fetch.
defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
- ValueType ScalTy, ValueType VecTy,
- Instruction LOADW, Instruction LOADX,
- SubRegIndex sub> {
- def : Pat<(VecTy (scalar_to_vector (ScalTy
- (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
- (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
- (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
- sub)>;
-
- def : Pat<(VecTy (scalar_to_vector (ScalTy
- (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
- (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
- (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
- sub)>;
-}
-
-let AddedComplexity = 10 in {
-defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
-defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
-
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
-defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
-
-defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
-defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;
-
-defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;
-
-defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;
-
-
-def : Pat <(v1i64 (scalar_to_vector (i64
- (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
- ro_Wextend64:$extend))))),
- (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
-
-def : Pat <(v1i64 (scalar_to_vector (i64
- (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
- ro_Xextend64:$extend))))),
- (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
-}
-
// Match all load 64 bits width whose type is compatible with FPR64
multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
Instruction LOADW, Instruction LOADX> {
@@ -3478,42 +3604,6 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (scalar_to_vector (i32
- (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
- (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
- (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v16i8 (scalar_to_vector (i32
- (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-def : Pat <(v4i16 (scalar_to_vector (i32
- (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
- (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v8i16 (scalar_to_vector (i32
- (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
- (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
- (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-def : Pat <(v2i32 (scalar_to_vector (i32
- (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
- (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
- (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v4i32 (scalar_to_vector (i32
- (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
- (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
- (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-def : Pat <(v1i64 (scalar_to_vector (i64
- (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
- (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
-def : Pat <(v2i64 (scalar_to_vector (i64
- (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
- (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
- (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
-
// Match all load 64 bits width whose type is compatible with FPR64
let Predicates = [IsLE] in {
// We must use LD1 to perform vector loads in big-endian.
@@ -3879,12 +3969,13 @@ def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
(LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
-// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros,
-// load, 0) can use a single load.
-multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
- ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
- ComplexPattern Addr, ComplexPattern UnscaledAddr, Operand AddrImm,
- SubRegIndex SubReg> {
+// A LDR will implicitly zero the rest of the vector, so vector_insert(zeros, load, 0)
+// can use a single load. Same for scalar_to_vector(load) or insert(undef, load, 0).
+multiclass LoadInsertVTPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType ScalarVT,
+ Instruction LoadInst, Instruction UnscaledLoadInst,
+ Instruction ROWLoadInst, Instruction ROXLoadInst,
+ ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+ Operand AddrImm, SubRegIndex SubReg> {
// Scaled
def : Pat <(vector_insert (VT immAllZerosV),
(ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
@@ -3893,42 +3984,82 @@ multiclass LoadInsertZeroPatterns<SDPatternOperator LoadOp, ValueType VT, ValueT
def : Pat <(vector_insert (VT immAllZerosV),
(ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
(SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
+ // roW
+ def : Pat <(vector_insert (VT immAllZerosV),
+ (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (i64 0)),
+ (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+ // roX
+ def : Pat <(vector_insert (VT immAllZerosV),
+ (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (i64 0)),
+ (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
- // Half-vector patterns
- def : Pat <(vector_insert (HVT immAllZerosV),
- (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
- (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
- // Unscaled
- def : Pat <(vector_insert (HVT immAllZerosV),
- (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
- (SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-
- // SVE patterns
- def : Pat <(vector_insert (SVT immAllZerosV),
- (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))), (i64 0)),
- (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
- // Unscaled
- def : Pat <(vector_insert (SVT immAllZerosV),
- (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))), (i64 0)),
+ // Undef equivalents of the patterns above.
+ def : Pat <(VT (vec_ins_or_scal_vec
+ (ScalarVT (LoadOp (Addr GPR64sp:$Rn, AddrImm:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LoadInst GPR64sp:$Rn, AddrImm:$offset), SubReg)>;
+ def : Pat <(VT (vec_ins_or_scal_vec
+ (ScalarVT (LoadOp (UnscaledAddr GPR64sp:$Rn, simm9:$offset))))),
(SUBREG_TO_REG (i64 0), (UnscaledLoadInst GPR64sp:$Rn, simm9:$offset), SubReg)>;
-}
-
-defm : LoadInsertZeroPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32, LDRBui, LDURBi,
- am_indexed8, am_unscaled8, uimm12s1, bsub>;
-defm : LoadInsertZeroPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32, LDRHui, LDURHi,
- am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load, v4i32, v2i32, nxv4i32, i32, LDRSui, LDURSi,
- am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load, v2i64, v1i64, nxv2i64, i64, LDRDui, LDURDi,
- am_indexed64, am_unscaled64, uimm12s8, dsub>;
-defm : LoadInsertZeroPatterns<load, v8f16, v4f16, nxv8f16, f16, LDRHui, LDURHi,
- am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16, LDRHui, LDURHi,
- am_indexed16, am_unscaled16, uimm12s2, hsub>;
-defm : LoadInsertZeroPatterns<load, v4f32, v2f32, nxv4f32, f32, LDRSui, LDURSi,
- am_indexed32, am_unscaled32, uimm12s4, ssub>;
-defm : LoadInsertZeroPatterns<load, v2f64, v1f64, nxv2f64, f64, LDRDui, LDURDi,
- am_indexed64, am_unscaled64, uimm12s8, dsub>;
+ def : Pat <(VT (vec_ins_or_scal_vec
+ (ScalarVT (LoadOp (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (ROWLoadInst GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), SubReg)>;
+ def : Pat <(VT (vec_ins_or_scal_vec
+ (ScalarVT (LoadOp (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))))),
+ (SUBREG_TO_REG (i64 0), (ROXLoadInst GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), SubReg)>;
+}
+
+multiclass LoadInsertPatterns<SDPatternOperator LoadOp, ValueType VT, ValueType HVT, ValueType SVT,
+ ValueType ScalarVT, Instruction LoadInst, Instruction UnscaledLoadInst,
+ Instruction ROWLoadInst, Instruction ROXLoadInst,
+ ROAddrMode ro, ComplexPattern Addr, ComplexPattern UnscaledAddr,
+ Operand AddrImm, SubRegIndex SubReg> {
+ defm : LoadInsertVTPatterns<LoadOp, VT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+ ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+ defm : LoadInsertVTPatterns<LoadOp, HVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+ ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+ defm : LoadInsertVTPatterns<LoadOp, SVT, ScalarVT, LoadInst, UnscaledLoadInst, ROWLoadInst,
+ ROXLoadInst, ro, Addr, UnscaledAddr, AddrImm, SubReg>;
+}
+
+defm : LoadInsertPatterns<extloadi8, v16i8, v8i8, nxv16i8, i32,
+ LDRBui, LDURBi, LDRBroW, LDRBroX,
+ ro8, am_indexed8, am_unscaled8, uimm12s1, bsub>;
+defm : LoadInsertPatterns<extloadi16, v8i16, v4i16, nxv8i16, i32,
+ LDRHui, LDURHi, LDRHroW, LDRHroX,
+ ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load, v4i32, v2i32, nxv4i32, i32,
+ LDRSui, LDURSi, LDRSroW, LDRSroX,
+ ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load, v2i64, isVoid, nxv2i64, i64,
+ LDRDui, LDURDi, LDRDroW, LDRDroX,
+ ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+defm : LoadInsertPatterns<load, v8f16, v4f16, nxv8f16, f16,
+ LDRHui, LDURHi, LDRHroW, LDRHroX,
+ ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load, v8bf16, v4bf16, nxv8bf16, bf16,
+ LDRHui, LDURHi, LDRHroW, LDRHroX,
+ ro16, am_indexed16, am_unscaled16, uimm12s2, hsub>;
+defm : LoadInsertPatterns<load, v4f32, v2f32, nxv4f32, f32,
+ LDRSui, LDURSi, LDRSroW, LDRSroX,
+ ro32, am_indexed32, am_unscaled32, uimm12s4, ssub>;
+defm : LoadInsertPatterns<load, v2f64, isVoid, nxv2f64, f64,
+ LDRDui, LDURDi, LDRDroW, LDRDroX,
+ ro64, am_indexed64, am_unscaled64, uimm12s8, dsub>;
+
+// Extra patterns for v1f64 scalar_to_vector(load), which need to avoid the
+// SUBREG_TO_REG used above.
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (ro64.Wpat GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend))))),
+ (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro64.Wext:$extend)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+ (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
+ (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
@@ -4049,6 +4180,33 @@ defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
}
+// Armv9.6-a Load/store pair (FEAT_LSUI)
+let Predicates = [HasLSUI] in {
+ defm LDTP : LoadPairOffset<0b11, 0, GPR64z, simm7s8, "ldtp">;
+ def LDTPpre : LoadPairPreIdx<0b11, 0, GPR64z, simm7s8, "ldtp">;
+ def LDTPpost : LoadPairPostIdx<0b11, 0, GPR64z, simm7s8, "ldtp">;
+
+ defm STTNPX : StorePairNoAllocLSUI<0b11, 0, GPR64z, simm7s8, "sttnp">;
+ defm LDTNPX : LoadPairNoAllocLSUI<0b11, 0, GPR64z, simm7s8, "ldtnp">;
+
+ defm STTP : StorePairOffset<0b11, 0, GPR64z, simm7s8, "sttp">;
+ def STTPpre : StorePairPreIdx<0b11, 0, GPR64z, simm7s8, "sttp">;
+ def STTPpost : StorePairPostIdx<0b11, 0, GPR64z, simm7s8, "sttp">;
+}
+
+let Predicates = [HasLSUI, HasNEON] in {
+ defm LDTPQ : LoadPairOffset<0b11, 1, FPR128Op, simm7s16, "ldtp">;
+ def LDTPQpre : LoadPairPreIdx<0b11, 1, FPR128Op, simm7s16, "ldtp">;
+ def LDTPQpost : LoadPairPostIdx<0b11, 1, FPR128Op, simm7s16, "ldtp">;
+
+ defm STTNPQ : StorePairNoAllocLSUI<0b11, 1, FPR128Op, simm7s16, "sttnp">;
+ defm LDTNPQ : LoadPairNoAllocLSUI<0b11, 1, FPR128Op, simm7s16, "ldtnp">;
+
+ defm STTPQ : StorePairOffset<0b11, 1, FPR128Op, simm7s16, "sttp">;
+ def STTPQpre : StorePairPreIdx<0b11, 1, FPR128Op, simm7s16, "sttp">;
+ def STTPQpost : StorePairPostIdx<0b11, 1, FPR128Op, simm7s16, "sttp">;
+}
+
def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
@@ -4504,6 +4662,10 @@ def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128Op, "str", pre_store, f128>;
def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32z, "strb", pre_truncsti8, i32>;
def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32z, "strh", pre_truncsti16, i32>;
+// bf16 pre-index store
+def : Pat<(pre_store (bf16 FPR16:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRHpre FPR16:$Rt, GPR64sp:$addr, simm9:$off)>;
+
// truncstore i64
def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
(STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
@@ -4529,6 +4691,8 @@ def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4bf16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -4544,6 +4708,8 @@ def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8bf16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//---
// (immediate post-indexed)
@@ -4689,6 +4855,29 @@ let Predicates = [HasLOR] in {
def STLLRH0 : InstAlias<"stllrh\t$Rt, [$Rn, #0]", (STLLRH GPR32: $Rt, GPR64sp:$Rn)>;
}
+// v9.6-a Unprivileged load store operations
+let Predicates = [HasLSUI] in {
+defm LDTXRW : LoadUnprivilegedLSUI<0b10, GPR32, "ldtxr">;
+defm LDTXRX : LoadUnprivilegedLSUI<0b11, GPR64, "ldtxr">;
+
+def : MnemonicAlias<"ldxr", "ldtxr">;
+
+def LDATXRW : LoadExclusiveLSUI <0b10, 1, 1, GPR32, "ldatxr">;
+def LDATXRX : LoadExclusiveLSUI <0b11, 1, 1, GPR64, "ldatxr">;
+
+def : MnemonicAlias<"ldaxr", "ldatxr">;
+
+defm STTXRW : StoreUnprivilegedLSUI<0b10, GPR32, "sttxr">;
+defm STTXRX : StoreUnprivilegedLSUI<0b11, GPR64, "sttxr">;
+
+def : MnemonicAlias<"stxr", "sttxr">;
+
+def STLTXRW : StoreExclusiveLSUI<0b10, 0, 1, GPR32, "stltxr">;
+def STLTXRX : StoreExclusiveLSUI<0b11, 0, 1, GPR64, "stltxr">;
+
+def : MnemonicAlias<"stlxr", "stltxr">;
+}
+
//===----------------------------------------------------------------------===//
// Scaled floating point to integer conversion instructions.
//===----------------------------------------------------------------------===//
@@ -4706,8 +4895,21 @@ defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
+let Predicates = [HasNEON, HasFPRCVT] in {
+ defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas">;
+ defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau">;
+ defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms">;
+ defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu">;
+ defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns">;
+ defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu">;
+ defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps">;
+ defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu">;
+ defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">;
+ defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">;
+}
+
// AArch64's FCVT instructions saturate when out of range.
-multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
+multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> {
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (to_int_sat f16:$Rn, i32)),
(!cast<Instruction>(INST # UWHr) f16:$Rn)>;
@@ -4724,6 +4926,21 @@ multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # UWHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # UXHr) f16:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # UWSr) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # UXSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # UWDr) f64:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ let Predicates = [HasFullFP16] in {
def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)),
(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
def : Pat<(i64 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i64:$scale), i64)),
@@ -4737,10 +4954,25 @@ multiclass FPToIntegerSatPats<SDNode to_int_sat, string INST> {
(!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
def : Pat<(i64 (to_int_sat (fmul f64:$Rn, fixedpoint_f64_i64:$scale), i64)),
(!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
+
+ let Predicates = [HasFullFP16] in {
+ def : Pat<(i32 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
+ (!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat_gi (fmul f16:$Rn, fixedpoint_f16_i64:$scale))),
+ (!cast<Instruction>(INST # SXHri) $Rn, $scale)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i32:$scale))),
+ (!cast<Instruction>(INST # SWSri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat_gi (fmul f32:$Rn, fixedpoint_f32_i64:$scale))),
+ (!cast<Instruction>(INST # SXSri) $Rn, $scale)>;
+ def : Pat<(i32 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i32:$scale))),
+ (!cast<Instruction>(INST # SWDri) $Rn, $scale)>;
+ def : Pat<(i64 (to_int_sat_gi (fmul f64:$Rn, fixedpoint_f64_i64:$scale))),
+ (!cast<Instruction>(INST # SXDri) $Rn, $scale)>;
}
-defm : FPToIntegerSatPats<fp_to_sint_sat, "FCVTZS">;
-defm : FPToIntegerSatPats<fp_to_uint_sat, "FCVTZU">;
+defm : FPToIntegerSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">;
+defm : FPToIntegerSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">;
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
let Predicates = [HasFullFP16] in {
@@ -4834,8 +5066,13 @@ def : Pat<(i64 (any_llround f64:$Rn)),
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
-defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>;
-defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>;
+defm SCVTF : IntegerToFP<0b00, 0b010, "scvtf", any_sint_to_fp>;
+defm UCVTF : IntegerToFP<0b00, 0b011, "ucvtf", any_uint_to_fp>;
+
+let Predicates = [HasNEON, HasFPRCVT] in {
+ defm SCVTF : IntegerToFPSIMDScalar<0b11, 0b100, "scvtf">;
+ defm UCVTF : IntegerToFPSIMDScalar<0b11, 0b101, "ucvtf">;
+}
def : Pat<(f16 (fdiv (f16 (any_sint_to_fp (i32 GPR32:$Rn))), fixedpoint_f16_i32:$scale)),
(SCVTFSWHri GPR32:$Rn, fixedpoint_f16_i32:$scale)>;
@@ -4907,22 +5144,6 @@ let Predicates = [HasFullFP16] in {
//===----------------------------------------------------------------------===//
defm FCVT : FPConversion<"fcvt">;
-// Helper to get bf16 into fp32.
-def cvt_bf16_to_fp32 :
- OutPatFrag<(ops node:$Rn),
- (f32 (COPY_TO_REGCLASS
- (i32 (UBFMWri
- (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
- node:$Rn, hsub), GPR32)),
- (i64 (i32shift_a (i64 16))),
- (i64 (i32shift_b (i64 16))))),
- FPR32))>;
-// Pattern for bf16 -> fp32.
-def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))),
- (cvt_bf16_to_fp32 FPR16:$Rn)>;
-// Pattern for bf16 -> fp64.
-def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))),
- (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>;
//===----------------------------------------------------------------------===//
// Floating point single operand instructions.
@@ -5049,6 +5270,27 @@ def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
(FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(fminnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)),
+ (FMINNMDrr FPR64:$a, FPR64:$b)>;
+def : Pat<(fmaxnum_ieee (f64 FPR64:$a), (f64 FPR64:$b)),
+ (FMAXNMDrr FPR64:$a, FPR64:$b)>;
+def : Pat<(f64 (fcanonicalize f64:$a)),
+ (FMINNMDrr f64:$a, f64:$a)>;
+def : Pat<(fminnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)),
+ (FMINNMSrr FPR32:$a, FPR32:$b)>;
+def : Pat<(fmaxnum_ieee (f32 FPR32:$a), (f32 FPR32:$b)),
+ (FMAXNMSrr FPR32:$a, FPR32:$b)>;
+def : Pat<(f32 (fcanonicalize f32:$a)),
+ (FMINNMSrr f32:$a, f32:$a)>;
+
+let Predicates = [HasFullFP16] in {
+def : Pat<(fminnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)),
+ (FMINNMHrr FPR16:$a, FPR16:$b)>;
+def : Pat<(fmaxnum_ieee (f16 FPR16:$a), (f16 FPR16:$b)),
+ (FMAXNMHrr FPR16:$a, FPR16:$b)>;
+def : Pat<(f16 (fcanonicalize f16:$a)),
+ (FMINNMHrr f16:$a, f16:$a)>;
+}
//===----------------------------------------------------------------------===//
// Floating point three operand instructions.
//===----------------------------------------------------------------------===//
@@ -5159,7 +5401,7 @@ let isPseudo = 1 in {
//===----------------------------------------------------------------------===//
let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
isCodeGenOnly = 1, isReturn = 1, isEHScopeReturn = 1, isPseudo = 1 in {
- def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret)]>, Sched<[]>;
+ def CLEANUPRET : Pseudo<(outs), (ins), [(cleanupret bb)]>, Sched<[]>;
let usesCustomInserter = 1 in
def CATCHRET : Pseudo<(outs), (ins am_brcond:$dst, am_brcond:$src), [(catchret bb:$dst, bb:$src)]>,
Sched<[]>;
@@ -5286,12 +5528,17 @@ defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
// AArch64's FCVT instructions saturate when out of range.
-multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> {
+multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> {
let Predicates = [HasFullFP16] in {
def : Pat<(v4i16 (to_int_sat v4f16:$Rn, i16)),
(!cast<Instruction>(INST # v4f16) v4f16:$Rn)>;
def : Pat<(v8i16 (to_int_sat v8f16:$Rn, i16)),
(!cast<Instruction>(INST # v8f16) v8f16:$Rn)>;
+
+ def : Pat<(v4i16 (to_int_sat_gi v4f16:$Rn)),
+ (!cast<Instruction>(INST # v4f16) v4f16:$Rn)>;
+ def : Pat<(v8i16 (to_int_sat_gi v8f16:$Rn)),
+ (!cast<Instruction>(INST # v8f16) v8f16:$Rn)>;
}
def : Pat<(v2i32 (to_int_sat v2f32:$Rn, i32)),
(!cast<Instruction>(INST # v2f32) v2f32:$Rn)>;
@@ -5299,9 +5546,16 @@ multiclass SIMDTwoVectorFPToIntSatPats<SDNode to_int_sat, string INST> {
(!cast<Instruction>(INST # v4f32) v4f32:$Rn)>;
def : Pat<(v2i64 (to_int_sat v2f64:$Rn, i64)),
(!cast<Instruction>(INST # v2f64) v2f64:$Rn)>;
+
+ def : Pat<(v2i32 (to_int_sat_gi v2f32:$Rn)),
+ (!cast<Instruction>(INST # v2f32) v2f32:$Rn)>;
+ def : Pat<(v4i32 (to_int_sat_gi v4f32:$Rn)),
+ (!cast<Instruction>(INST # v4f32) v4f32:$Rn)>;
+ def : Pat<(v2i64 (to_int_sat_gi v2f64:$Rn)),
+ (!cast<Instruction>(INST # v2f64) v2f64:$Rn)>;
}
-defm : SIMDTwoVectorFPToIntSatPats<fp_to_sint_sat, "FCVTZS">;
-defm : SIMDTwoVectorFPToIntSatPats<fp_to_uint_sat, "FCVTZU">;
+defm : SIMDTwoVectorFPToIntSatPats<fp_to_sint_sat, fp_to_sint_sat_gi, "FCVTZS">;
+defm : SIMDTwoVectorFPToIntSatPats<fp_to_uint_sat, fp_to_uint_sat_gi, "FCVTZU">;
def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
@@ -5363,14 +5617,14 @@ defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", any_sint_to_fp>;
defm SHLL : SIMDVectorLShiftLongBySizeBHS;
defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
-defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
-defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
+defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", truncssat_s>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", truncssat_u>;
defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
BinOpFrag<(add node:$LHS, (AArch64uaddlp node:$RHS))> >;
defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", AArch64uaddlp>;
defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", any_uint_to_fp>;
-defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", truncusat_u>;
defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
@@ -5409,84 +5663,16 @@ defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
-// Constant vector values, used in the S/UQXTN patterns below.
-def VImmFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 85))))>;
-def VImmFFFF: PatLeaf<(AArch64NvCast (v2i64 (AArch64movi_edit (i32 51))))>;
-def VImm7F: PatLeaf<(AArch64movi_shift (i32 127), (i32 0))>;
-def VImm80: PatLeaf<(AArch64mvni_shift (i32 127), (i32 0))>;
-def VImm7FFF: PatLeaf<(AArch64movi_msl (i32 127), (i32 264))>;
-def VImm8000: PatLeaf<(AArch64mvni_msl (i32 127), (i32 264))>;
-
-// trunc(umin(X, 255)) -> UQXTRN v8i8
-def : Pat<(v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))),
- (UQXTNv8i8 V128:$Vn)>;
-// trunc(umin(X, 65535)) -> UQXTRN v4i16
-def : Pat<(v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))),
- (UQXTNv4i16 V128:$Vn)>;
-// trunc(smin(smax(X, -128), 128)) -> SQXTRN
-// with reversed min/max
-def : Pat<(v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)),
- (v8i16 VImm7F)))),
- (SQXTNv8i8 V128:$Vn)>;
-def : Pat<(v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)),
- (v8i16 VImm80)))),
- (SQXTNv8i8 V128:$Vn)>;
-// trunc(smin(smax(X, -32768), 32767)) -> SQXTRN
-// with reversed min/max
-def : Pat<(v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)),
- (v4i32 VImm7FFF)))),
- (SQXTNv4i16 V128:$Vn)>;
-def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)),
- (v4i32 VImm8000)))),
- (SQXTNv4i16 V128:$Vn)>;
-
-// concat_vectors(Vd, trunc(umin(X, 255))) -> UQXTRN(Vd, Vn)
-def : Pat<(v16i8 (concat_vectors
- (v8i8 V64:$Vd),
- (v8i8 (trunc (umin (v8i16 V128:$Vn), (v8i16 VImmFF)))))),
- (UQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-// concat_vectors(Vd, trunc(umin(X, 65535))) -> UQXTRN(Vd, Vn)
-def : Pat<(v8i16 (concat_vectors
- (v4i16 V64:$Vd),
- (v4i16 (trunc (umin (v4i32 V128:$Vn), (v4i32 VImmFFFF)))))),
- (UQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-
-// concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn)
-// with reversed min/max
-def : Pat<(v16i8 (concat_vectors
- (v8i8 V64:$Vd),
- (v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)),
- (v8i16 VImm7F)))))),
- (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-def : Pat<(v16i8 (concat_vectors
- (v8i8 V64:$Vd),
- (v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)),
- (v8i16 VImm80)))))),
- (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-
-// concat_vectors(Vd, trunc(smin(smax Vm, -32768), 32767) ~> SQXTN2(Vd, Vn)
-// with reversed min/max
-def : Pat<(v8i16 (concat_vectors
- (v4i16 V64:$Vd),
- (v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)),
- (v4i32 VImm7FFF)))))),
- (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-def : Pat<(v8i16 (concat_vectors
- (v4i16 V64:$Vd),
- (v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)),
- (v4i32 VImm8000)))))),
- (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>;
-
// Select BSWAP vector instructions into REV instructions
-def : Pat<(v4i16 (bswap (v4i16 V64:$Rn))),
+def : Pat<(v4i16 (bswap (v4i16 V64:$Rn))),
(v4i16 (REV16v8i8 (v4i16 V64:$Rn)))>;
-def : Pat<(v8i16 (bswap (v8i16 V128:$Rn))),
+def : Pat<(v8i16 (bswap (v8i16 V128:$Rn))),
(v8i16 (REV16v16i8 (v8i16 V128:$Rn)))>;
-def : Pat<(v2i32 (bswap (v2i32 V64:$Rn))),
+def : Pat<(v2i32 (bswap (v2i32 V64:$Rn))),
(v2i32 (REV32v8i8 (v2i32 V64:$Rn)))>;
-def : Pat<(v4i32 (bswap (v4i32 V128:$Rn))),
+def : Pat<(v4i32 (bswap (v4i32 V128:$Rn))),
(v4i32 (REV32v16i8 (v4i32 V128:$Rn)))>;
-def : Pat<(v2i64 (bswap (v2i64 V128:$Rn))),
+def : Pat<(v2i64 (bswap (v2i64 V128:$Rn))),
(v2i64 (REV64v16i8 (v2i64 V128:$Rn)))>;
//===----------------------------------------------------------------------===//
@@ -5530,6 +5716,42 @@ defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", any_fminnum>;
defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", any_fminimum>;
+let Predicates = [HasNEON] in {
+def : Pat<(v2f64 (fminnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))),
+ (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>;
+def : Pat<(v2f64 (fmaxnum_ieee (v2f64 V128:$Rn), (v2f64 V128:$Rm))),
+ (v2f64 (FMAXNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rm)))>;
+def : Pat<(v2f64 (fcanonicalize (v2f64 V128:$Rn))),
+ (v2f64 (FMINNMv2f64 (v2f64 V128:$Rn), (v2f64 V128:$Rn)))>;
+def : Pat<(v4f32 (fminnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+ (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>;
+def : Pat<(v4f32 (fmaxnum_ieee (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+ (v4f32 (FMAXNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rm)))>;
+def : Pat<(v4f32 (fcanonicalize (v4f32 V128:$Rn))),
+ (v4f32 (FMINNMv4f32 (v4f32 V128:$Rn), (v4f32 V128:$Rn)))>;
+def : Pat<(v2f32 (fminnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>;
+def : Pat<(v2f32 (fmaxnum_ieee (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (v2f32 (FMAXNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm)))>;
+def : Pat<(v2f32 (fcanonicalize (v2f32 V64:$Rn))),
+ (v2f32 (FMINNMv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rn)))>;
+}
+
+let Predicates = [HasNEON, HasFullFP16] in {
+def : Pat<(v8f16 (fminnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>;
+def : Pat<(v8f16 (fmaxnum_ieee (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (v8f16 (FMAXNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm)))>;
+def : Pat<(v8f16 (fcanonicalize (v8f16 V128:$Rn))),
+ (v8f16 (FMINNMv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rn)))>;
+def : Pat<(v4f16 (fminnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>;
+def : Pat<(v4f16 (fmaxnum_ieee (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (v4f16 (FMAXNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm)))>;
+def : Pat<(v4f16 (fcanonicalize (v4f16 V64:$Rn))),
+ (v4f16 (FMINNMv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rn)))>;
+}
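
FMINNM/FMAXNM implement the IEEE-754 minNum/maxNum semantics directly, and fcanonicalize(x) is folded to FMINNM(x, x) because the minimum of a value with itself is that value, with NaNs quieted. A rough scalar sketch of the idea in C++, using std::fmin as a stand-in for the vector instruction (an assumption for illustration, not the actual lowering code):

#include <cmath>

// minNum-style canonicalization: the minimum of a value with itself returns
// that value for ordinary inputs and collapses NaNs to a quiet NaN, which is
// the property the fcanonicalize -> FMINNM(x, x) patterns above rely on.
float canonicalize_via_min(float x) {
  return std::fmin(x, x);
}
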
+
// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
// instruction expects the addend first, while the fma intrinsic puts it last.
defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
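
As a reminder of that argument order, a tiny C++ illustration (not part of the patch):

#include <cmath>

// std::fma(a, b, c) computes a*b + c with the addend last; FMLA Vd, Vn, Vm
// accumulates into Vd, so the addend is the first (tied) operand and the
// PatFrag swaps the operands accordingly.
double fused_multiply_add(double a, double b, double acc) {
  return std::fma(a, b, acc);
}
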
@@ -6156,7 +6378,7 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
// Some float -> int -> float conversion patterns for which we want to keep the
// int values in FP registers using the corresponding NEON instructions to
// avoid more costly int <-> fp register transfers.
-let Predicates = [HasNEONandIsStreamingSafe] in {
+let Predicates = [HasNEONandIsSME2p2StreamingSafe] in {
def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))),
(SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>;
def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))),
@@ -6166,14 +6388,14 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))),
def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))),
(UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>;
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))),
(SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>;
def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}
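
A hypothetical C++ example of the source-level round trip these patterns are aimed at:

#include <cstdint>

// (double)(int64_t)x: with the patterns above, both conversions can stay in
// an FP/SIMD register (FCVTZS d, d followed by SCVTF d, d) instead of moving
// the intermediate integer through a general-purpose register.
double round_trip_through_int(double x) {
  return static_cast<double>(static_cast<int64_t>(x));
}
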
-// int -> float conversion of value in lane 0 of simd vector should use
+// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
(SCVTFv1i32 (i32 (EXTRACT_SUBREG (v4i32 FPR128:$Rn), ssub)))>;
@@ -6189,13 +6411,13 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))),
// fp16: integer extraction from vector must be at least 32-bits to be legal.
// Actual extraction result is then an in-reg sign-extension of lower 16-bits.
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
-def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
- (v8i16 FPR128:$Rn), (i64 0))), i16)))),
+let Predicates = [HasNEONandIsSME2p2StreamingSafe, HasFullFP16] in {
+def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract
+ (v8i16 FPR128:$Rn), (i64 0))), i16)))),
(SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
// unsigned 32-bit extracted element is truncated to 16-bits using AND
-def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract
+def : Pat<(f16 (uint_to_fp (i32 (and (i32 (vector_extract
(v8i16 FPR128:$Rn), (i64 0))), (i32 65535))))),
(UCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>;
}
@@ -6286,7 +6508,7 @@ def : Pat <(f64 (uint_to_fp (i32
(LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
-} // let Predicates = [HasNEONandIsStreamingSafe]
+} // let Predicates = [HasNEONandIsSME2p2StreamingSafe]
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
@@ -6588,6 +6810,46 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
let Predicates = [HasLUT] in {
defm LUT2 : BaseSIMDTableLookupIndexed2<"luti2">;
defm LUT4 : BaseSIMDTableLookupIndexed4<"luti4">;
+
+ multiclass Luti2_patterns<Instruction Instr, ValueType VT64, ValueType VT128>{
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT64:$Rn,
+ v8i8:$Rm, i32:$idx)),
+ (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT64:$Rn,
+ v16i8:$Rm, i32:$idx)),
+ (Instr (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub),
+ V128:$Rm, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_lane VT128:$Rn,
+ v8i8:$Rm, i32:$idx)),
+ (Instr V128:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub),
+ VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT128 (int_aarch64_neon_vluti2_laneq VT128:$Rn,
+ v16i8:$Rm, i32:$idx)),
+ (Instr V128:$Rn, V128:$Rm, VectorIndexS32b_timm:$idx)>;
+ }
+
+ defm : Luti2_patterns<LUT2_B, v8i8, v16i8>;
+ defm : Luti2_patterns<LUT2_H, v4i16, v8i16>;
+ defm : Luti2_patterns<LUT2_H, v4f16, v8f16>;
+ defm : Luti2_patterns<LUT2_H, v4bf16, v8bf16>;
+
+ def : Pat<(v16i8 (int_aarch64_neon_vluti4q_laneq v16i8:$Rn,
+ v16i8:$Rm, i32:$idx)),
+ (LUT4_B VecListOne16b:$Rn, V128:$Rm, VectorIndexD32b_timm:$idx)>;
+ def : Pat<(v16i8 (int_aarch64_neon_vluti4q_lane v16i8:$Rn,
+ v8i8:$Rm, i32:$idx)),
+ (LUT4_B VecListOne16b:$Rn, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexD32b_timm:$idx)>;
+
+ foreach VT = [v8i16, v8f16, v8bf16] in {
+ def : Pat<(VT (int_aarch64_neon_vluti4q_laneq_x2 VT:$Rn1,
+ VT:$Rn2, v16i8:$Rm, i32:$idx)),
+ (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1), V128:$Rm, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(VT (int_aarch64_neon_vluti4q_lane_x2 VT:$Rn1,
+ VT:$Rn2, v8i8:$Rm, i32:$idx)),
+ (LUT4_H (REG_SEQUENCE QQ, VecListOne8h:$Rn1, qsub0, VecListOne8h:$Rn2, qsub1),
+ (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rm, dsub), VectorIndexS32b_timm:$idx)>;
+ }
}
//----------------------------------------------------------------------------
@@ -6728,6 +6990,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+// Also covers DUP (truncate i64 to i32)
+def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUPv2i32lane V128:$Rn, imm:$idx)>;
+def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
+ (DUPv4i32lane V128:$Rn, imm:$idx)>;
+
// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
@@ -6741,44 +7009,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
- ValueType Src128VT, ValueType ScalVT,
- Instruction DUP, SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
- imm:$idx)))),
- (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
- def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
- imm:$idx)))),
- (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
-
-defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
-defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
-defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
-
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
- SDNodeXForm IdxXFORM> {
- def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
- imm:$idx))))),
- (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
- def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
- imm:$idx))))),
- (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
-
-defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
-defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
-defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
+ Instruction DUP, SDNodeXForm IdxXFORM>
+ : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
+ (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+// DUP (truncate i16 to i8)
+def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
+def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+// DUP (truncate i32/i64 to i8)
+def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
+def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+// DUP (truncate i32/i64 to i16)
+def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
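
The index transforms above simply scale the element index by the size ratio, because broadcasting a truncated element is the same as broadcasting the narrow lane that holds its low bits. A small C++ sketch of that byte-lane arithmetic, assuming the default little-endian lane layout:

#include <cstdint>
#include <cstring>

// Broadcasting (uint8_t)src[idx] over 8 bytes equals broadcasting byte lane
// 4*idx of the same data viewed as bytes -- the VecIndex_x4 remapping.
void dup_trunc_i32_to_i8(const uint32_t src[4], unsigned idx, uint8_t out[8]) {
  uint8_t bytes[16];
  std::memcpy(bytes, src, 16);      // reinterpret the v4i32 as v16i8
  uint8_t lane = bytes[4 * idx];    // low byte of element idx (little-endian)
  for (int i = 0; i < 8; ++i)
    out[i] = lane;                  // DUPv8i8lane with the remapped index
}
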
// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
@@ -6824,10 +7068,10 @@ def : Pat<(i64 (and (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
defm INS : SIMDIns;
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v16i8 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i8 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
@@ -6835,50 +7079,49 @@ def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
def : Pat<(v8i8 (bitconvert (i64 (zext GPR32:$Rn)))),
(SUBREG_TO_REG (i32 0), (f32 (FMOVWSr GPR32:$Rn)), ssub)>;
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v8i16 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)),
(SUBREG_TO_REG (i32 0),
(f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v2i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+def : Pat<(v4i32 (vec_ins_or_scal_vec (i32 FPR32:$Rn))),
(v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
-
-def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+def : Pat<(v2i64 (vec_ins_or_scal_vec (i64 FPR64:$Rn))),
(v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
(i64 FPR64:$Rn), dsub))>;
-def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
+def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v4bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+def : Pat<(v8bf16 (vec_ins_or_scal_vec (bf16 FPR16:$Rn))),
(INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v4f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+def : Pat<(v2f32 (vec_ins_or_scal_vec (f32 FPR32:$Rn))),
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+def : Pat<(v2f64 (vec_ins_or_scal_vec (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
@@ -7002,8 +7245,23 @@ def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
)>;
-multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
- ValueType VTScal, Instruction INS> {
+// Move elements between vectors
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64, ValueType VTSVE,
+ ValueType VTScal, Operand SVEIdxTy, Instruction INS> {
+ // Extracting from the lowest 128-bits of an SVE vector
+ def : Pat<(VT128 (vector_insert VT128:$Rn,
+ (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))),
+ (i64 imm:$Immd))),
+ (INS VT128:$Rn, imm:$Immd, (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn)>;
+
+ def : Pat<(VT64 (vector_insert VT64:$Rn,
+ (VTScal (vector_extract VTSVE:$Rm, (i64 SVEIdxTy:$Immn))),
+ (i64 imm:$Immd))),
+ (EXTRACT_SUBREG
+ (INS (SUBREG_TO_REG (i64 0), VT64:$Rn, dsub), imm:$Immd,
+ (VT128 (EXTRACT_SUBREG VTSVE:$Rm, zsub)), SVEIdxTy:$Immn),
+ dsub)>;
+ // Extracting from another NEON vector
def : Pat<(VT128 (vector_insert V128:$src,
(VTScal (vector_extract (VT128 V128:$Rn), (i64 imm:$Immn))),
(i64 imm:$Immd))),
@@ -7031,15 +7289,15 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
dsub)>;
}
-defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
-defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;
-defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
-defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v8f16, v4f16, nxv8f16, f16, VectorIndexH, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v8bf16, v4bf16, nxv8bf16, bf16, VectorIndexH, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4f32, v2f32, nxv4f32, f32, VectorIndexS, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, nxv2f64, f64, VectorIndexD, INSvi64lane>;
-defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>;
-defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
-defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
-defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, nxv16i8, i32, VectorIndexB, INSvi8lane>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, nxv8i16, i32, VectorIndexH, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, nxv4i32, i32, VectorIndexS, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, nxv2i64, i64, VectorIndexD, INSvi64lane>;
// Insert from bitcast
// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
@@ -7089,7 +7347,8 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which had just as well be
// INS.
-multiclass ConcatPat<ValueType DstTy, ValueType SrcTy> {
+multiclass ConcatPat<ValueType DstTy, ValueType SrcTy,
+ ComplexPattern ExtractHigh> {
def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
(INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
@@ -7102,16 +7361,22 @@ multiclass ConcatPat<ValueType DstTy, ValueType SrcTy> {
// If the high lanes are undef we can just ignore them:
def : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+  // Concatenating the high halves of two vectors is an insert of the high
+  // half of the first into the low half of the second.
+ def : Pat<(DstTy (concat_vectors (ExtractHigh (DstTy V128:$Rn)),
+ (ExtractHigh (DstTy V128:$Rm)))),
+ (INSvi64lane V128:$Rm, (i64 0), V128:$Rn, (i64 1))>;
}
-defm : ConcatPat<v2i64, v1i64>;
-defm : ConcatPat<v2f64, v1f64>;
-defm : ConcatPat<v4i32, v2i32>;
-defm : ConcatPat<v4f32, v2f32>;
-defm : ConcatPat<v8i16, v4i16>;
-defm : ConcatPat<v8f16, v4f16>;
-defm : ConcatPat<v8bf16, v4bf16>;
-defm : ConcatPat<v16i8, v8i8>;
+defm : ConcatPat<v2i64, v1i64, extract_high_v2i64>;
+defm : ConcatPat<v2f64, v1f64, extract_high_v2f64>;
+defm : ConcatPat<v4i32, v2i32, extract_high_v4i32>;
+defm : ConcatPat<v4f32, v2f32, extract_high_v4f32>;
+defm : ConcatPat<v8i16, v4i16, extract_high_v8i16>;
+defm : ConcatPat<v8f16, v4f16, extract_high_v8f16>;
+defm : ConcatPat<v8bf16, v4bf16, extract_high_v8bf16>;
+defm : ConcatPat<v16i8, v8i8, extract_high_v16i8>;
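
The new ExtractHigh case covers the common intrinsic idiom of combining the high halves of two vectors, which can now become a single INS. A hedged <arm_neon.h> example of that idiom (the function name is made up for illustration):

#include <arm_neon.h>

// concat(high(a), high(b)) -- matched by the ExtractHigh pattern above and
// selected to INSvi64lane (insert a's high half into b's low half).
float32x4_t combine_high_halves(float32x4_t a, float32x4_t b) {
  return vcombine_f32(vget_high_f32(a), vget_high_f32(b));
}
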
//----------------------------------------------------------------------------
// AdvSIMD across lanes instructions
@@ -7172,17 +7437,6 @@ multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc, SDPatternOperator ad
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
-// Patterns for uaddlv(uaddlp(x)) ==> uaddlv
-def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
- (i64 (EXTRACT_SUBREG
- (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub)),
- dsub))>;
-
-def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
- (i32 (EXTRACT_SUBREG
- (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
- ssub))>;
-
def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
(v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
@@ -7325,19 +7579,19 @@ def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))),
}
// For vecreduce_add, used by GlobalISel not SDAG
-def : Pat<(i8 (vecreduce_add (v8i8 V64:$Rn))),
+def : Pat<(i8 (vecreduce_add (v8i8 V64:$Rn))),
(i8 (ADDVv8i8v V64:$Rn))>;
-def : Pat<(i8 (vecreduce_add (v16i8 V128:$Rn))),
+def : Pat<(i8 (vecreduce_add (v16i8 V128:$Rn))),
(i8 (ADDVv16i8v V128:$Rn))>;
-def : Pat<(i16 (vecreduce_add (v4i16 V64:$Rn))),
+def : Pat<(i16 (vecreduce_add (v4i16 V64:$Rn))),
(i16 (ADDVv4i16v V64:$Rn))>;
-def : Pat<(i16 (vecreduce_add (v8i16 V128:$Rn))),
+def : Pat<(i16 (vecreduce_add (v8i16 V128:$Rn))),
(i16 (ADDVv8i16v V128:$Rn))>;
-def : Pat<(i32 (vecreduce_add (v2i32 V64:$Rn))),
+def : Pat<(i32 (vecreduce_add (v2i32 V64:$Rn))),
(i32 (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub))>;
-def : Pat<(i32 (vecreduce_add (v4i32 V128:$Rn))),
+def : Pat<(i32 (vecreduce_add (v4i32 V128:$Rn))),
(i32 (ADDVv4i32v V128:$Rn))>;
-def : Pat<(i64 (vecreduce_add (v2i64 V128:$Rn))),
+def : Pat<(i64 (vecreduce_add (v2i64 V128:$Rn))),
(i64 (ADDPv2i64p V128:$Rn))>;
defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>;
@@ -7382,103 +7636,33 @@ def : Pat<(i16 (opNode (v4i16 FPR64:$Rn))),
def : Pat<(i16 (opNode (v8i16 FPR128:$Rn))),
(!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) FPR128:$Rn)>;
-def : Pat<(i32 (opNode (v4i32 V128:$Rn))),
+def : Pat<(i32 (opNode (v4i32 V128:$Rn))),
(!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn)>;
}
// For v2i32 source type, the pairwise instruction can be used instead
defm : SIMDAcrossLanesVecReductionIntrinsic<"UMINV", vecreduce_umin>;
-def : Pat<(i32 (vecreduce_umin (v2i32 V64:$Rn))),
+def : Pat<(i32 (vecreduce_umin (v2i32 V64:$Rn))),
(i32 (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub))>;
defm : SIMDAcrossLanesVecReductionIntrinsic<"UMAXV", vecreduce_umax>;
-def : Pat<(i32 (vecreduce_umax (v2i32 V64:$Rn))),
+def : Pat<(i32 (vecreduce_umax (v2i32 V64:$Rn))),
(i32 (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>;
defm : SIMDAcrossLanesVecReductionIntrinsic<"SMINV", vecreduce_smin>;
-def : Pat<(i32 (vecreduce_smin (v2i32 V64:$Rn))),
+def : Pat<(i32 (vecreduce_smin (v2i32 V64:$Rn))),
(i32 (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub))>;
defm : SIMDAcrossLanesVecReductionIntrinsic<"SMAXV", vecreduce_smax>;
-def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))),
+def : Pat<(i32 (vecreduce_smax (v2i32 V64:$Rn))),
(i32 (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub))>;
-multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
- def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
- (i32 (SMOVvi16to32
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
- (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
- (i32 (SMOVvi16to32
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
- (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
- ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
- ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
- dsub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
- Intrinsic intOp> {
- def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
- ssub))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
- ssub))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
- ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
- (i32 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
- ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
- dsub))>;
-}
-
-defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
-defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
-
-// The vaddlv_s32 intrinsic gets mapped to SADDLP.
-def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (SADDLPv2i32_v1i64 V64:$Rn), dsub),
- dsub))>;
-// The vaddlv_u32 intrinsic gets mapped to UADDLP.
-def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
- (i64 (EXTRACT_SUBREG
- (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
- (UADDLPv2i32_v1i64 V64:$Rn), dsub),
- dsub))>;
+// The SADDLV v2i32 gets mapped to SADDLP.
+def : Pat<(v2i64 (AArch64saddlv (v2i32 V64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (SADDLPv2i32_v1i64 V64:$Rn), dsub))>;
+// The UADDLV v2i32 gets mapped to UADDLP.
+def : Pat<(v2i64 (AArch64uaddlv (v2i32 V64:$Rn))),
+ (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub))>;
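
For a two-lane 32-bit input there is only one pair to add, so the add-long-across-vector is just a single widening pairwise add. A scalar C++ sketch:

#include <cstdint>

// saddlv/uaddlv on a v2i32 is one widening pairwise add: SADDLP/UADDLP.
int64_t saddlv_v2i32(const int32_t in[2]) {
  return static_cast<int64_t>(in[0]) + static_cast<int64_t>(in[1]);
}
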
//------------------------------------------------------------------------------
// AdvSIMD modified immediate instructions
@@ -8005,15 +8189,15 @@ def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftL64:$imm))),
(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
- int_aarch64_neon_sqrshrn>;
+ BinOpFrag<(truncssat_s (AArch64srshri node:$LHS, node:$RHS))>>;
defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
- int_aarch64_neon_sqrshrun>;
+ BinOpFrag<(truncssat_u (AArch64srshri node:$LHS, node:$RHS))>>;
defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
- int_aarch64_neon_sqshrn>;
+ BinOpFrag<(truncssat_s (AArch64vashr node:$LHS, node:$RHS))>>;
defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
- int_aarch64_neon_sqshrun>;
+ BinOpFrag<(truncssat_u (AArch64vashr node:$LHS, node:$RHS))>>;
defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>;
def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
@@ -8031,10 +8215,10 @@ defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
int_aarch64_neon_vcvtfxu2fp>;
defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
- int_aarch64_neon_uqrshrn>;
+ BinOpFrag<(truncusat_u (AArch64urshri node:$LHS, node:$RHS))>>;
defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
- int_aarch64_neon_uqshrn>;
+ BinOpFrag<(truncusat_u (AArch64vlshr node:$LHS, node:$RHS))>>;
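
Architecturally these narrows are a (rounding) right shift at the wide element size followed by a saturating truncate, which is exactly what the rewritten operands express. A scalar C++ sketch of one SQRSHRN lane, assuming a shift amount in the range 1..8:

#include <algorithm>
#include <cstdint>

// One lane of sqrshrn: rounding arithmetic shift right, then signed
// saturation to int8_t -- i.e. truncssat_s(srshr(x, s)) as in the PatFrag.
int8_t sqrshrn_lane(int16_t x, int s) {
  int32_t rounded = (static_cast<int32_t>(x) + (1 << (s - 1))) >> s;
  return static_cast<int8_t>(std::clamp(rounded, -128, 127));
}
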
defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
TriOpFrag<(add node:$LHS,
@@ -8129,6 +8313,20 @@ def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
(SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(shl (v8i16 (zext (v8i8 V64:$Rm))), (v8i16 (AArch64dup (i32 imm32_0_7:$size)))),
+ (USHLLv8i8_shift V64:$Rm, (i32 imm32_0_7:$size))>;
+def : Pat<(shl (v4i32 (zext (v4i16 V64:$Rm))), (v4i32 (AArch64dup (i32 imm32_0_15:$size)))),
+ (USHLLv4i16_shift V64:$Rm, (i32 imm32_0_15:$size))>;
+def : Pat<(shl (v2i64 (zext (v2i32 V64:$Rm))), (v2i64 (AArch64dup (i64 imm0_31:$size)))),
+ (USHLLv2i32_shift V64:$Rm, (trunc_imm imm0_31:$size))>;
+
+def : Pat<(shl (v8i16 (sext (v8i8 V64:$Rm))), (v8i16 (AArch64dup (i32 imm32_0_7:$size)))),
+ (SSHLLv8i8_shift V64:$Rm, (i32 imm32_0_7:$size))>;
+def : Pat<(shl (v4i32 (sext (v4i16 V64:$Rm))), (v4i32 (AArch64dup (i32 imm32_0_15:$size)))),
+ (SSHLLv4i16_shift V64:$Rm, (i32 imm32_0_15:$size))>;
+def : Pat<(shl (v2i64 (sext (v2i32 V64:$Rm))), (v2i64 (AArch64dup (i64 imm0_31:$size)))),
+ (SSHLLv2i32_shift V64:$Rm, (trunc_imm imm0_31:$size))>;
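
These patterns catch the "widen, then shift left by an immediate" form so it can be selected to a single USHLL/SSHLL #imm. A hedged <arm_neon.h> example of two equivalent spellings (function names are made up for illustration):

#include <arm_neon.h>

// Both return the same value; the patterns above allow the first form to be
// selected to a single USHLL v.8h, v.8b, #3 as well.
uint16x8_t widen_then_shift(uint8x8_t x) {
  return vshlq_n_u16(vmovl_u8(x), 3);   // zext to u16 lanes, then shl #3
}

uint16x8_t ushll_direct(uint8x8_t x) {
  return vshll_n_u8(x, 3);              // single USHLL #3
}
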
+
// Vector sign and zero extensions are implemented with SSHLL and USHLL.
// Anyexts are implemented as zexts.
def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
@@ -8140,8 +8338,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>
def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-// Vector bf16 -> fp32 is implemented morally as a zext + shift.
-def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>;
// Also match an extend from the upper half of a 128 bit source register.
def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))),
(USHLLv16i8_shift V128:$Rn, (i32 0))>;
@@ -8550,7 +8746,7 @@ def : Ld1Lane64IdxOpPat<extloadi8, VectorIndexH, v4i16, i32, LD1i8, VectorIndexH
let Predicates = [HasNEON] in {
class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
SDPatternOperator ExtLoad, Instruction LD1>
- : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
+ : Pat<(ResultTy (vec_ins_or_scal_vec (i32 (ExtLoad GPR64sp:$Rn)))),
(ResultTy (EXTRACT_SUBREG
(LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;
@@ -8983,11 +9179,11 @@ def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+def : Pat<(v1i64 (vec_ins_or_scal_vec GPR64:$Xn)),
(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+def : Pat<(v1f64 (vec_ins_or_scal_vec GPR64:$Xn)),
(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+def : Pat<(v1f64 (vec_ins_or_scal_vec (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
(COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
@@ -9587,6 +9783,18 @@ def : Pat<(v16i8 (add (AArch64uzp1 (v16i8 FPR128:$Rn), (v16i8 FPR128:$Rm)),
(AArch64uzp2 (v16i8 FPR128:$Rn), (v16i8 FPR128:$Rm)))),
(v16i8 (ADDPv16i8 $Rn, $Rm))>;
+def : Pat<(v2i32 (add (AArch64zip1 (extract_subvector (v4i32 FPR128:$Rn), (i64 0)),
+ (extract_subvector (v4i32 FPR128:$Rn), (i64 2))),
+ (AArch64zip2 (extract_subvector (v4i32 FPR128:$Rn), (i64 0)),
+ (extract_subvector (v4i32 FPR128:$Rn), (i64 2))))),
+ (EXTRACT_SUBREG (ADDPv4i32 $Rn, $Rn), dsub)>;
+def : Pat<(v4i16 (add (trunc (v4i32 (bitconvert FPR128:$Rn))),
+ (extract_subvector (AArch64uzp2 (v8i16 FPR128:$Rn), undef), (i64 0)))),
+ (EXTRACT_SUBREG (ADDPv8i16 $Rn, $Rn), dsub)>;
+def : Pat<(v8i8 (add (trunc (v8i16 (bitconvert FPR128:$Rn))),
+ (extract_subvector (AArch64uzp2 (v16i8 FPR128:$Rn), undef), (i64 0)))),
+ (EXTRACT_SUBREG (ADDPv16i8 $Rn, $Rn), dsub)>;
+
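
Whether the pairwise add is written with zip1/zip2 of the two halves or as a truncate plus uzp2, the value computed is the sum of adjacent lanes, which ADDP produces directly. A scalar C++ sketch of the operation being recognised:

#include <cstdint>

// Pairwise add of adjacent lanes: out[i] = in[2*i] + in[2*i+1], i.e. one ADDP.
void pairwise_add(const uint16_t in[8], uint16_t out[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = static_cast<uint16_t>(in[2 * i] + in[2 * i + 1]);
}
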
def : Pat<(v2f64 (fadd (AArch64zip1 (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm)),
(AArch64zip2 (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm)))),
(v2f64 (FADDPv2f64 $Rn, $Rm))>;
@@ -9689,6 +9897,7 @@ def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+let Size = 8 in
def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>;
def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>;
@@ -9826,8 +10035,10 @@ foreach i = 0-7 in {
}
let Predicates = [HasLS64] in {
+ let mayLoad = 1 in
def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn),
(outs GPR64x8:$Rt)>;
+ let mayStore = 1 in
def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn),
(outs)>;
def ST64BV: Store64BV<0b011, "st64bv">;
@@ -9872,14 +10083,6 @@ let Predicates = [HasMOPS, HasMTE] in {
}
}
-// MOPS Node operands: 0: Dst, 1: Src or Value, 2: Size, 3: Chain
-// MOPS Node results: 0: Dst writeback, 1: Size writeback, 2: Chain
-def SDT_AArch64mops : SDTypeProfile<2, 3, [ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2> ]>;
-def AArch64mops_memset : SDNode<"AArch64ISD::MOPS_MEMSET", SDT_AArch64mops>;
-def AArch64mops_memset_tagging : SDNode<"AArch64ISD::MOPS_MEMSET_TAGGING", SDT_AArch64mops>;
-def AArch64mops_memcopy : SDNode<"AArch64ISD::MOPS_MEMCOPY", SDT_AArch64mops>;
-def AArch64mops_memmove : SDNode<"AArch64ISD::MOPS_MEMMOVE", SDT_AArch64mops>;
-
// MOPS operations always contain three 4-byte instructions
let Predicates = [HasMOPS], Defs = [NZCV], Size = 12, mayStore = 1 in {
let mayLoad = 1 in {
@@ -10122,44 +10325,60 @@ let Predicates = [HasD128] in {
//===----------------------------===//
let Predicates = [HasFP8] in {
- defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">;
- defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">;
- defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">;
- defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
- defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
- defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
- defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
+ defm F1CVTL : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>;
+ defm F2CVTL : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>;
+ defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>;
+ defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>;
+ defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+ defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+ defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
} // End let Predicates = [HasFP8]
-let Predicates = [HasFAMINMAX] in {
- defm FAMAX : SIMDThreeSameVectorFP<0b0, 0b1, 0b011, "famax", null_frag>;
- defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", null_frag>;
-} // End let Predicates = [HasFAMAXMIN]
+// fminimum(abs(a), abs(b)) -> famin(a, b)
+// fminnum[nnan](abs(a), abs(b)) -> famin(a, b)
+def AArch64famin : PatFrags<(ops node:$Rn, node:$Rm),
+ [(int_aarch64_neon_famin node:$Rn, node:$Rm),
+ (fminimum (fabs node:$Rn), (fabs node:$Rm)),
+ (fminnum_nnan (fabs node:$Rn), (fabs node:$Rm))]>;
+
+// fmaximum(abs(a), abs(b)) -> famax(a, b)
+// fmaxnum[nnan](abs(a), abs(b)) -> famax(a, b)
+def AArch64famax : PatFrags<(ops node:$Rn, node:$Rm),
+ [(int_aarch64_neon_famax node:$Rn, node:$Rm),
+ (fmaximum (fabs node:$Rn), (fabs node:$Rm)),
+ (fmaxnum_nnan (fabs node:$Rn), (fabs node:$Rm))]>;
+
+let Predicates = [HasNEON, HasFAMINMAX] in {
+ defm FAMAX : SIMDThreeSameVectorFP<0b0, 0b1, 0b011, "famax", AArch64famax>;
+ defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", AArch64famin>;
+} // End let Predicates = [HasNEON, HasFAMINMAX]
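
These PatFrags let the ordinary source-level "minimum/maximum of absolute values" be selected to FAMIN/FAMAX when the min/max is NaN-aware (fminimum/fmaximum) or known to be nnan. A hedged C++ example of code the fminnum_nnan form could match, e.g. when NaNs are excluded by -ffinite-math-only:

#include <cmath>

// fmin(|a|, |b|): with FEAT_FAMINMAX and no-NaNs FP math, the patterns above
// allow the vectorized form of this to become a single FAMIN.
float abs_min(float a, float b) {
  return std::fmin(std::fabs(a), std::fabs(b));
}
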
let Predicates = [HasFP8FMA] in {
- defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb">;
- defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt">;
- defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb">;
- defm FMLALLBTlane : SIMDThreeSameVectorMLALIndex<0b0, 0b01, "fmlallbt">;
- defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb">;
- defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt">;
-
- defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb">;
- defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt">;
- defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb">;
- defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt">;
- defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb">;
- defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">;
+ defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb_lane>;
+ defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt_lane>;
+ defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb_lane>;
+ defm FMLALLBTlane : SIMDThreeSameVectorMLALIndex<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt_lane>;
+ defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb_lane>;
+ defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt_lane>;
+}
+
+let Predicates = [HasFP8FMA], Uses = [FPMR, FPCR], mayLoad = 1 in {
+ defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb>;
+ defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt>;
+ defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb>;
+ defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt>;
+ defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb>;
+ defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt>;
} // End let Predicates = [HasFP8FMA]
let Predicates = [HasFP8DOT2] in {
- defm FDOTlane : SIMDThreeSameVectorFP8DOT2Index<"fdot">;
- defm FDOT : SIMDThreeSameVectorDOT2<"fdot">;
+ defm FDOTlane : SIMD_FP8_Dot2_Index<"fdot", int_aarch64_neon_fp8_fdot2_lane>;
+ defm FDOT : SIMD_FP8_Dot2<"fdot", int_aarch64_neon_fp8_fdot2>;
} // End let Predicates = [HasFP8DOT2]
let Predicates = [HasFP8DOT4] in {
- defm FDOTlane : SIMDThreeSameVectorFP8DOT4Index<"fdot">;
- defm FDOT : SIMDThreeSameVectorDOT4<"fdot">;
+ defm FDOTlane : SIMD_FP8_Dot4_Index<"fdot", int_aarch64_neon_fp8_fdot4_lane>;
+ defm FDOT : SIMD_FP8_Dot4<"fdot", int_aarch64_neon_fp8_fdot4>;
} // End let Predicates = [HasFP8DOT4]
//===----------------------------------------------------------------------===//
@@ -10203,9 +10422,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn)),
(v8bf16 (BFCVTN2
- (v8bf16 (BFCVTN
- (v4f32 (OutInst
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (v4bf16 (BFCVTN
+ (v4f32 (OutInst
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
+ dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
let Predicates = [HasNoBF16] in
@@ -10240,10 +10461,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
let Predicates = [HasBF16] in
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
(v8bf16 (BFCVTN2
- (v8bf16 (BFCVTN
- (v4f32 (OutInst
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
- (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+ (INSERT_SUBREG (IMPLICIT_DEF),
+ (v4bf16 (BFCVTN
+ (v4f32 (OutInst
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
+ (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
+ dsub),
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
@@ -10262,6 +10485,132 @@ defm : PromoteBinaryv8f16Tov4f32<any_fdiv, FDIVv4f32>;
defm : PromoteBinaryv8f16Tov4f32<any_fmul, FMULv4f32>;
defm : PromoteBinaryv8f16Tov4f32<any_fsub, FSUBv4f32>;
+let Predicates = [HasCMPBR] in {
+ defm CBGT : CmpBranchRegister<0b000, "cbgt">;
+ defm CBGE : CmpBranchRegister<0b001, "cbge">;
+ defm CBHI : CmpBranchRegister<0b010, "cbhi">;
+ defm CBHS : CmpBranchRegister<0b011, "cbhs">;
+ defm CBEQ : CmpBranchRegister<0b110, "cbeq">;
+ defm CBNE : CmpBranchRegister<0b111, "cbne">;
+
+ def CBHGTWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b000, 0b11, "cbhgt">;
+ def CBHGEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b001, 0b11, "cbhge">;
+ def CBHHIWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b010, 0b11, "cbhhi">;
+ def CBHHSWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b011, 0b11, "cbhhs">;
+ def CBHEQWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b110, 0b11, "cbheq">;
+ def CBHNEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b111, 0b11, "cbhne">;
+
+ def CBBGTWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b000, 0b10, "cbbgt">;
+ def CBBGEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b001, 0b10, "cbbge">;
+ def CBBHIWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b010, 0b10, "cbbhi">;
+ def CBBHSWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b011, 0b10, "cbbhs">;
+ def CBBEQWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b110, 0b10, "cbbeq">;
+ def CBBNEWrr : BaseCmpBranchRegister<GPR32, 0b0, 0b111, 0b10, "cbbne">;
+
+ defm CBGT : CmpBranchImmediate<0b000, "uimm6", "cbgt">;
+ defm CBLT : CmpBranchImmediate<0b001, "uimm6", "cblt">;
+ defm CBHI : CmpBranchImmediate<0b010, "uimm6", "cbhi">;
+ defm CBLO : CmpBranchImmediate<0b011, "uimm6", "cblo">;
+ defm CBEQ : CmpBranchImmediate<0b110, "uimm6", "cbeq">;
+ defm CBNE : CmpBranchImmediate<0b111, "uimm6", "cbne">;
+
+ defm : CmpBranchImmediateAlias<"cbge", "CBGT", "uimm6p1">;
+ defm : CmpBranchImmediateAlias<"cbhs", "CBHI", "uimm6p1">;
+ defm : CmpBranchImmediateAlias<"cble", "CBLT", "uimm6m1">;
+ defm : CmpBranchImmediateAlias<"cbls", "CBLO", "uimm6m1">;
+
+ defm : CmpBranchRegisterAlias<"cble", "CBGE">;
+ defm : CmpBranchRegisterAlias<"cblo", "CBHI">;
+ defm : CmpBranchRegisterAlias<"cbls", "CBHS">;
+ defm : CmpBranchRegisterAlias<"cblt", "CBGT">;
+
+ defm : CmpBranchWRegisterAlias<"cbble", "CBBGE">;
+ defm : CmpBranchWRegisterAlias<"cbblo", "CBBHI">;
+ defm : CmpBranchWRegisterAlias<"cbbls", "CBBHS">;
+ defm : CmpBranchWRegisterAlias<"cbblt", "CBBGT">;
+
+ defm : CmpBranchWRegisterAlias<"cbhle", "CBHGE">;
+ defm : CmpBranchWRegisterAlias<"cbhlo", "CBHHI">;
+ defm : CmpBranchWRegisterAlias<"cbhls", "CBHHS">;
+ defm : CmpBranchWRegisterAlias<"cbhlt", "CBHGT">;
+} // HasCMPBR
+
+
+//===-----------------------------------------------------===//
+// Atomic floating-point in-memory instructions (FEAT_LSFE)
+//===-----------------------------------------------------===//
+
+let Predicates = [HasLSFE] in {
+ // Floating-point Atomic Load
+ defm LDFADDA : AtomicFPLoad<0b10, 0b000, "ldfadda">;
+ defm LDFADDAL : AtomicFPLoad<0b11, 0b000, "ldfaddal">;
+ defm LDFADD : AtomicFPLoad<0b00, 0b000, "ldfadd">;
+ defm LDFADDL : AtomicFPLoad<0b01, 0b000, "ldfaddl">;
+ defm LDFMAXA : AtomicFPLoad<0b10, 0b100, "ldfmaxa">;
+ defm LDFMAXAL : AtomicFPLoad<0b11, 0b100, "ldfmaxal">;
+ defm LDFMAX : AtomicFPLoad<0b00, 0b100, "ldfmax">;
+ defm LDFMAXL : AtomicFPLoad<0b01, 0b100, "ldfmaxl">;
+ defm LDFMINA : AtomicFPLoad<0b10, 0b101, "ldfmina">;
+ defm LDFMINAL : AtomicFPLoad<0b11, 0b101, "ldfminal">;
+ defm LDFMIN : AtomicFPLoad<0b00, 0b101, "ldfmin">;
+ defm LDFMINL : AtomicFPLoad<0b01, 0b101, "ldfminl">;
+ defm LDFMAXNMA : AtomicFPLoad<0b10, 0b110, "ldfmaxnma">;
+ defm LDFMAXNMAL : AtomicFPLoad<0b11, 0b110, "ldfmaxnmal">;
+ defm LDFMAXNM : AtomicFPLoad<0b00, 0b110, "ldfmaxnm">;
+ defm LDFMAXNML : AtomicFPLoad<0b01, 0b110, "ldfmaxnml">;
+ defm LDFMINNMA : AtomicFPLoad<0b10, 0b111, "ldfminnma">;
+ defm LDFMINNMAL : AtomicFPLoad<0b11, 0b111, "ldfminnmal">;
+  defm LDFMINNM    : AtomicFPLoad<0b00, 0b111, "ldfminnm">;
+ defm LDFMINNML : AtomicFPLoad<0b01, 0b111, "ldfminnml">;
+ // BFloat16
+ def LDBFADDA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b000, "ldbfadda">;
+ def LDBFADDAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b000, "ldbfaddal">;
+ def LDBFADD : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b000, "ldbfadd">;
+ def LDBFADDL : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b000, "ldbfaddl">;
+ def LDBFMAXA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b100, "ldbfmaxa">;
+ def LDBFMAXAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b100, "ldbfmaxal">;
+ def LDBFMAX : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b100, "ldbfmax">;
+ def LDBFMAXL : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b100, "ldbfmaxl">;
+ def LDBFMINA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b101, "ldbfmina">;
+ def LDBFMINAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b101, "ldbfminal">;
+ def LDBFMIN : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b101, "ldbfmin">;
+ def LDBFMINL : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b101, "ldbfminl">;
+ def LDBFMAXNMA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b110, "ldbfmaxnma">;
+ def LDBFMAXNMAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b110, "ldbfmaxnmal">;
+ def LDBFMAXNM : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b110, "ldbfmaxnm">;
+ def LDBFMAXNML : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b110, "ldbfmaxnml">;
+ def LDBFMINNMA : BaseAtomicFPLoad<FPR16, 0b00, 0b10, 0b111, "ldbfminnma">;
+ def LDBFMINNMAL : BaseAtomicFPLoad<FPR16, 0b00, 0b11, 0b111, "ldbfminnmal">;
+ def LDBFMINNM : BaseAtomicFPLoad<FPR16, 0b00, 0b00, 0b111, "ldbfminnm">;
+ def LDBFMINNML : BaseAtomicFPLoad<FPR16, 0b00, 0b01, 0b111, "ldbfminnml">;
+
+ // Floating-point Atomic Store
+ defm STFADD : AtomicFPStore<0b0, 0b000, "stfadd">;
+ defm STFADDL : AtomicFPStore<0b1, 0b000, "stfaddl">;
+ defm STFMAX : AtomicFPStore<0b0, 0b100, "stfmax">;
+ defm STFMAXL : AtomicFPStore<0b1, 0b100, "stfmaxl">;
+ defm STFMIN : AtomicFPStore<0b0, 0b101, "stfmin">;
+ defm STFMINL : AtomicFPStore<0b1, 0b101, "stfminl">;
+ defm STFMAXNM : AtomicFPStore<0b0, 0b110, "stfmaxnm">;
+ defm STFMAXNML : AtomicFPStore<0b1, 0b110, "stfmaxnml">;
+ defm STFMINNM : AtomicFPStore<0b0, 0b111, "stfminnm">;
+ defm STFMINNML : AtomicFPStore<0b1, 0b111, "stfminnml">;
+ // BFloat16
+ def STBFADD : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b000, "stbfadd">;
+ def STBFADDL : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b000, "stbfaddl">;
+ def STBFMAX : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b100, "stbfmax">;
+ def STBFMAXL : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b100, "stbfmaxl">;
+ def STBFMIN : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b101, "stbfmin">;
+ def STBFMINL : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b101, "stbfminl">;
+ def STBFMAXNM : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b110, "stbfmaxnm">;
+ def STBFMAXNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b110, "stbfmaxnml">;
+ def STBFMINNM : BaseAtomicFPStore<FPR16, 0b00, 0b0, 0b111, "stbfminnm">;
+ def STBFMINNML : BaseAtomicFPStore<FPR16, 0b00, 0b1, 0b111, "stbfminnml">;
+}
+
+let Uses = [FPMR, FPCR] in
+defm FMMLA : SIMDThreeSameVectorFP8MatrixMul<"fmmla">;
+
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
include "AArch64SMEInstrInfo.td"