author     Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit     cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree       209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/Target/X86/X86ISelLowering.cpp
parent     706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Vendor import of llvm-project master 2e10b7a39b9, the last commit before
the llvmorg-12-init tag, from which release/11.x was branched.

Notes:
    svn path=/vendor/llvm-project/master/; revision=363578
    svn path=/vendor/llvm-project/llvmorg-11-init-20887-g2e10b7a39b9/; revision=363579; tag=vendor/llvm-project/llvmorg-11-init-20887-g2e10b7a39b9
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--   llvm/lib/Target/X86/X86ISelLowering.cpp   10153
1 file changed, 6524 insertions(+), 3629 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0f152968ddfd..450927aaf5cc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12,7 +12,8 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
-#include "Utils/X86ShuffleDecode.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
+#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
@@ -28,6 +29,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -37,7 +39,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -75,13 +76,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
-// Added in 10.0.
-static cl::opt<bool> EnableOldKNLABI(
- "x86-enable-old-knl-abi", cl::init(false),
- cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
- "one ZMM register on AVX512F, but not AVX512BW targets."),
- cl::Hidden);
-
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -164,7 +158,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
- // FIXME: Should we be limitting the atomic size on other configs? Default is
+ // FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
@@ -190,12 +184,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
- setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
- setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
- setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+ for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
+ setCondCodeAction(ISD::SETOEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
// Integer absolute.
if (Subtarget.hasCMov()) {
@@ -206,10 +198,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+ // For slow shld targets we only lower for code size.
+ LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+
+ setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
- setOperationAction(ShiftOp , MVT::i32 , Custom);
+ setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
- setOperationAction(ShiftOp , MVT::i64 , Custom);
+ setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
}
if (!Subtarget.useSoftFloat()) {
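
For context on the funnel-shift hunk above (illustrative sketch, not part of the patch): ISD::FSHL/FSHR on two 32- or 64-bit operands is exactly the operation SHLD/SHRD computes, which is why i32/i64 can be marked Legal on subtargets with fast SHLD, while the comment above indicates slow-SHLD subtargets keep the Custom hook so SHLD is presumably only emitted when optimizing for size.

#include <cstdint>

// Funnel shift left of the 64-bit concatenation Hi:Lo by Amt (1..31),
// returning the high 32 bits; this is what SHLD r32, r32, imm8 computes
// and what ISD::FSHL models for i32.
uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  return (Hi << Amt) | (Lo >> (32 - Amt));
}
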
@@ -270,6 +266,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+
+ setOperationAction(ISD::LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f64, Custom);
+
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::LRINT, MVT::i64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::i64, Custom);
+ }
}
// Handle address space casts between mixed sized pointers.
@@ -347,34 +353,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
- setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
- if (Subtarget.is64Bit()) {
- setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::CTLZ , VT, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
}
}
- // Special handling for half-precision floating point conversions.
- // If we don't have F16C support, then lower half float conversions
- // into library calls.
- if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
- setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
+ ISD::STRICT_FP_TO_FP16}) {
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ setOperationAction(
+ Op, MVT::f32,
+ (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
+ // There's never any support for operations beyond MVT::f32.
+ setOperationAction(Op, MVT::f64, Expand);
+ setOperationAction(Op, MVT::f80, Expand);
+ setOperationAction(Op, MVT::f128, Expand);
}
- // There's never any support for operations beyond MVT::f32.
- setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
- setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
-
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
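
A hedged illustration of the half-precision setup above (the F16C intrinsics from immintrin.h are an assumption for the example, not taken from the patch): with F16C available the Custom path can use the hardware converts, and without it the same source-level conversion expands into a runtime library call, matching the Expand action.

#include <immintrin.h>  // build with -mf16c for the hardware path

// Scalar half <-> float conversions; with F16C these map to
// VCVTPH2PS / VCVTPS2PH, otherwise the FP16_TO_FP / FP_TO_FP16 nodes
// configured above become library calls.
float halfToFloat(unsigned short H) { return _cvtsh_ss(H); }
unsigned short floatToHalf(float F) {
  return _cvtss_sh(F, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
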
@@ -542,7 +542,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
- } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
+ } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+ (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -663,8 +664,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
- setOperationAction(ISD::LRINT, MVT::f80, Expand);
- setOperationAction(ISD::LLRINT, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f80, Custom);
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
@@ -1038,8 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
- // With AVX512, expanding (and promoting the shifts) is better.
- if (!Subtarget.hasAVX512())
+ // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
+ // shifts) is better.
+ if (!Subtarget.useAVX512Regs() &&
+ !(Subtarget.hasBWI() && Subtarget.hasVLX()))
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
@@ -1078,6 +1081,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+
+ setOperationAction(ISD::FROUND, RoundedTy, Custom);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1170,6 +1175,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::FROUND, VT, Custom);
+
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
@@ -1221,7 +1229,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is the better.
- if (!Subtarget.hasBWI())
+ if (!Subtarget.useBWIRegs())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
@@ -1412,19 +1420,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -1432,7 +1444,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
@@ -1443,10 +1454,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ bool HasBWI = Subtarget.hasBWI();
+
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
@@ -1454,6 +1469,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ if (HasBWI)
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
@@ -1497,6 +1514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ if (HasBWI)
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
@@ -1509,19 +1528,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
+ if (HasBWI) {
+ // Extends from v64i1 masks to 512-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+ }
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -1535,47 +1561,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
}
- // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
- for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
+ for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
- setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
- setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ }
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
if (Subtarget.hasDQI()) {
@@ -1610,36 +1658,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ }
+
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
- if (!Subtarget.hasBWI()) {
- // Need to custom split v32i16/v64i8 bitcasts.
- setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
-
- // Better to split these into two 256-bit ops.
- setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
+ if (HasBWI) {
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+ } else {
+ setOperationAction(ISD::STORE, MVT::v32i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
- for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
- }// has AVX-512
+ }// useAVX512Regs
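
The net effect of folding the old useBWIRegs() block (removed further down) into this one is that v32i16/v64i8 always get VR512 register classes whenever 512-bit registers are usable, with the individual operation actions gated on HasBWI. As a hedged illustration (the intrinsic below assumes AVX512BW and is not from the patch): a 64 x i8 add is a single instruction when BWI is available, and is Custom-lowered, typically by splitting into 256-bit halves, when it is not.

#include <immintrin.h>  // requires -mavx512bw for this intrinsic

// With AVX512BW: one VPADDB on a ZMM register (Legal above). Without it,
// the same v64i8 ISD::ADD is marked Custom and gets split.
__m512i add_v64i8(__m512i A, __m512i B) { return _mm512_add_epi8(A, B); }
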
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
@@ -1667,6 +1721,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
+ if (Subtarget.hasDQI()) {
+ // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
+ // v2f32 UINT_TO_FP is already custom under SSE2.
+ assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
+ "Unexpected operation action!");
+ // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
+ }
+
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1746,12 +1813,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -1759,93 +1824,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
- }
-
- // This block controls legalization for v32i16 and v64i8. 512-bits can be
- // disabled based on prefer-vector-width and required-vector-width function
- // attributes.
- if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
- addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
- // Extends from v64i1 masks to 512-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
-
- setOperationAction(ISD::MUL, MVT::v32i16, Legal);
- setOperationAction(ISD::MUL, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
-
- setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
-
- for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Legal);
- setOperationAction(ISD::SADDSAT, VT, Legal);
- setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SSUBSAT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
-
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
- }
-
- for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
- setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
- }
-
- if (Subtarget.hasBITALG()) {
- for (auto VT : { MVT::v64i8, MVT::v32i16 })
- setOperationAction(ISD::CTPOP, VT, Legal);
- }
- if (Subtarget.hasVBMI2()) {
- setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
- setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
- }
- }
-
- if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
@@ -1874,19 +1853,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
- if (Subtarget.hasDQI()) {
- // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
- // v2f32 UINT_TO_FP is already custom under SSE2.
- assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
- isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
- "Unexpected operation action!");
- // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
- }
-
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
@@ -1983,6 +1949,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
@@ -2000,6 +1967,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::STRICT_FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
@@ -2024,6 +1992,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
+ setTargetDAGCombine(ISD::FP16_TO_FP);
+ setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
+ setTargetDAGCombine(ISD::FP_ROUND);
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2075,7 +2047,8 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
+ !Subtarget.hasBWI())
return TypeSplitVector;
if (VT.getVectorNumElements() != 1 &&
@@ -2085,51 +2058,73 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+static std::pair<MVT, unsigned>
+handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
+ const X86Subtarget &Subtarget) {
+ // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
+ // convention is one that uses k registers.
+ if (NumElts == 2)
+ return {MVT::v2i64, 1};
+ if (NumElts == 4)
+ return {MVT::v4i32, 1};
+ if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v8i16, 1};
+ if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v16i8, 1};
+ // v32i1 passes in ymm unless we have BWI and the calling convention is
+ // regcall.
+ if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
+ return {MVT::v32i8, 1};
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
+ if (Subtarget.useAVX512Regs())
+ return {MVT::v64i8, 1};
+ return {MVT::v32i8, 2};
+ }
+
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
+ NumElts > 64)
+ return {MVT::i8, NumElts};
+
+ return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
+}
+
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return MVT::v32i8;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return MVT::i8;
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return MVT::v32i1;
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return MVT::v16i32;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return 1;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return VT.getVectorNumElements();
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return 2;
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return 1;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
+
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
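
A worked example of the new helper (hypothetical checks, since the function is file-local and Subtarget here stands for an X86Subtarget without AVX512BW): a v32i1 argument maps to one v32i8 in a YMM register for the default convention, and a v64i1 argument without AVX512BW falls through to the scalar-i8 case and is broken into 64 pieces, matching the avx2-compatible behavior the comment describes.

// Hypothetical checks mirroring the table above (illustrative only; they
// would have to live in this file or a test with access to the helper).
assert(handleMaskRegisterForCallingConv(32, CallingConv::C, Subtarget) ==
       std::make_pair(MVT(MVT::v32i8), 1u)); // v32i1 -> one YMM value
assert(handleMaskRegisterForCallingConv(64, CallingConv::C, Subtarget) ==
       std::make_pair(MVT(MVT::i8), 64u));   // v64i1 -> 64 x i8, no BWI
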
@@ -2140,8 +2135,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
+ (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
+ VT.getVectorNumElements() > 64)) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
@@ -2151,7 +2146,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
- RegisterVT = MVT::v32i1;
+ RegisterVT = MVT::v32i8;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
@@ -2194,20 +2189,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
- if (VTy->getBitWidth() == 128)
- MaxAlign = 16;
+ if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
+ MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
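
The getMaxByValAlign rewrite above is part of the unsigned-to-llvm::Align migration: Align always holds a power of two and default-constructs to 1 byte, so the old convention of 0 meaning "no alignment yet" disappears. A small sketch of the type's behavior (assuming llvm/Support/Alignment.h; not code from the patch):

#include "llvm/Support/Alignment.h"
#include <cassert>

// Align stores log2 of the alignment, so only powers of two are
// representable and comparisons such as EltAlign > MaxAlign above are
// always well defined.
void alignDemo() {
  llvm::Align A;           // default-constructed: 1 byte
  llvm::Align B(16);
  assert(B > A);
  assert(B.value() == 16); // .value() recovers the byte count, as used in
                           // getByValTypeAlignment below
}
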
@@ -2225,46 +2220,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
- unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ Align TyAlign = DL.getABITypeAlign(Ty);
if (TyAlign > 8)
- return TyAlign;
+ return TyAlign.value();
return 8;
}
- unsigned Align = 4;
+ Align Alignment(4);
if (Subtarget.hasSSE1())
- getMaxByValAlign(Ty, Align);
- return Align;
-}
-
-/// Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. If DstAlign is zero that means it's safe to destination
-/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If 'IsMemset' is
-/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-/// source is constant so it does not need to be loaded.
+ getMaxByValAlign(Ty, Alignment);
+ return Alignment.value();
+}
+
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Op.size() >= 16 &&
+ (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
- if (Size >= 64 && Subtarget.hasAVX512() &&
+ if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX() &&
+ if (Op.size() >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
@@ -2280,8 +2263,8 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
- } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
- !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
+ } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
+ Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
@@ -2294,7 +2277,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
- if (Subtarget.is64Bit() && Size >= 8)
+ if (Subtarget.is64Bit() && Op.size() >= 8)
return MVT::i64;
return MVT::i32;
}
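
Illustrative source example (not from the patch): the AVX-512 branch above means a fixed 64-byte copy can be typed as v64i8 (or v16i32 without AVX512BW) when 512-bit vectors are preferred, i.e. one ZMM load/store pair instead of several smaller moves.

#include <cstring>

// With AVX-512 enabled and a preferred vector width of 512, this 64-byte
// memcpy is a candidate for a single ZMM load + store per the type choice
// in getOptimalMemOpType above.
void copy64(void *Dst, const void *Src) { std::memcpy(Dst, Src, 64); }
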
@@ -2611,7 +2594,7 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
- SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
+ SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
@@ -2656,14 +2639,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
- SDValue Flag;
- SmallVector<SDValue, 6> RetOps;
- RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
- // Operand #1 = Bytes To Pop
- RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
- MVT::i32));
-
- // Copy the result values into the output registers.
+ SmallVector<std::pair<Register, SDValue>, 4> RetVals;
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
@@ -2715,7 +2691,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
- RetOps.push_back(ValToCopy);
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
// Don't emit a copytoreg.
continue;
}
@@ -2736,31 +2712,39 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
}
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
- Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
+ Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
Subtarget);
- assert(2 == RegsToPass.size() &&
- "Expecting two registers after Pass64BitArgInRegs");
-
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
+ }
- // Add nodes to the DAG and add the values into the RetOps list
- for (auto &Reg : RegsToPass) {
- Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+ MVT::i32));
+
+ // Copy the result values into the output registers.
+ for (auto &RetVal : RetVals) {
+ if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
+ RetOps.push_back(RetVal.second);
+ continue; // Don't emit a copytoreg.
}
+
+ Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(
+ DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Swift calling convention does not require we copy the sret argument
@@ -2775,7 +2759,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
- if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
@@ -2798,7 +2782,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
- unsigned RetValReg
+ Register RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
@@ -2924,7 +2908,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
@@ -3133,10 +3117,10 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
- return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
- /*isVolatile*/false, /*AlwaysInline=*/true,
- /*isTailCall*/false,
- MachinePointerInfo(), MachinePointerInfo());
+ return DAG.getMemcpy(
+ Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/true,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
@@ -3176,8 +3160,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
- ImmutableCallSite CS(CI);
- CallingConv::ID CalleeCC = CS.getCallingConv();
+ CallingConv::ID CalleeCC = CI->getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -3341,20 +3324,223 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
- return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
- [](const CCValAssign &A, const CCValAssign &B) -> bool {
- return A.getValNo() < B.getValNo();
- });
+ return llvm::is_sorted(
+ ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
+ return A.getValNo() < B.getValNo();
+ });
}
#endif
+namespace {
+/// This is a helper class for lowering variable arguments parameters.
+class VarArgsLoweringHelper {
+public:
+ VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ CallingConv::ID CallConv, CCState &CCInfo)
+ : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
+ TheMachineFunction(DAG.getMachineFunction()),
+ TheFunction(TheMachineFunction.getFunction()),
+ FrameInfo(TheMachineFunction.getFrameInfo()),
+ FrameLowering(*Subtarget.getFrameLowering()),
+ TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
+ CCInfo(CCInfo) {}
+
+ // Lower variable arguments parameters.
+ void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
+
+private:
+ void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
+
+ void forwardMustTailParameters(SDValue &Chain);
+
+ bool is64Bit() { return Subtarget.is64Bit(); }
+ bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }
+
+ X86MachineFunctionInfo *FuncInfo;
+ const SDLoc &DL;
+ SelectionDAG &DAG;
+ const X86Subtarget &Subtarget;
+ MachineFunction &TheMachineFunction;
+ const Function &TheFunction;
+ MachineFrameInfo &FrameInfo;
+ const TargetFrameLowering &FrameLowering;
+ const TargetLowering &TargLowering;
+ CallingConv::ID CallConv;
+ CCState &CCInfo;
+};
+} // namespace
+
+void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
+ SDValue &Chain, unsigned StackSize) {
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start. We
+ // can skip this if there are no va_start calls.
+ if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall)) {
+ FuncInfo->setVarArgsFrameIndex(
+ FrameInfo.CreateFixedObject(1, StackSize, true));
+ }
+
+ // Figure out if XMM registers are in use.
+ assert(!(Subtarget.useSoftFloat() &&
+ TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // 64-bit calling conventions support varargs and register parameters, so we
+ // have to do extra work to spill them in the prologue.
+ if (is64Bit()) {
+ // Find the first unallocated argument registers.
+ ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+ ArrayRef<MCPhysReg> ArgXMMs =
+ get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ if (isWin64()) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
+ }
+
+ SmallVector<SDValue, 6>
+ LiveGPRs; // list of SDValue for GPR registers keeping live input value
+ SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
+ // keeping live input value
+ SDValue ALVal; // if applicable keeps SDValue for %al register
+
+ // Gather all the live in physical registers.
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+ Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
+ LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
+ }
+ const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
+ if (!AvailableXmms.empty()) {
+ Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
+ for (MCPhysReg Reg : AvailableXmms) {
+ Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
+ LiveXMMRegs.push_back(
+ DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
+ }
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN =
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ TargLowering.getPointerTy(DAG.getDataLayout()));
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, DL,
+ TargLowering.getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, DL));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset));
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ if (!LiveXMMRegs.empty()) {
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(
+ DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
+ SaveXMMOps.push_back(
+ DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+ }
+}
+
+void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget.useAVX512Regs() &&
+ (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget.hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget.hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
+ Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &FR : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
+ FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
+ TargLowering.getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
+ }
+}
+
+void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
+ unsigned StackSize) {
+ // Set FrameIndex to the 0xAAAAAAA value to mark unset state.
+ // If necessary, it would be set into the correct value later.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+
+ if (FrameInfo.hasVAStart())
+ createVarArgAreaAndStoreRegisters(Chain, StackSize);
+
+ if (FrameInfo.hasMustTailInVarArgFunc())
+ forwardMustTailParameters(Chain);
+}
+
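
For orientation (example source, not from the patch): createVarArgAreaAndStoreRegisters above implements the SysV x86-64 convention in which up to six GPRs and eight XMM registers of unnamed arguments are spilled into a register save area in the prologue, with %al telling the variadic callee how many vector registers the caller used; va_arg then walks that area. Win64 instead reuses the caller-allocated home/shadow area, as the code notes.

#include <cstdarg>

// A plain variadic function whose prologue receives the treatment above
// on SysV x86-64 targets.
int sumInts(int N, ...) {
  va_list Ap;
  va_start(Ap, N);
  int Total = 0;
  for (int I = 0; I < N; ++I)
    Total += va_arg(Ap, int);
  va_end(Ap);
  return Total;
}
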
SDValue X86TargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
@@ -3366,16 +3552,16 @@ SDValue X86TargetLowering::LowerFormalArguments(
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
- !(isVarArg && canGuaranteeTCO(CallConv)) &&
+ !(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Ins, CC_X86);
@@ -3446,7 +3632,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else
llvm_unreachable("Unknown argument type!");
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
@@ -3500,7 +3686,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
- unsigned Reg = FuncInfo->getSRetReturnReg();
+ Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
@@ -3518,147 +3704,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
- // If the function takes variable number of arguments, make a frame index for
- // the start of the first vararg value... for expansion of llvm.va_start. We
- // can skip this if there are no va_start calls.
- if (MFI.hasVAStart() &&
- (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
- CallConv != CallingConv::X86_ThisCall))) {
- FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
- }
-
- // Figure out if XMM registers are in use.
- assert(!(Subtarget.useSoftFloat() &&
- F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
- "SSE register cannot be used when SSE is disabled!");
-
- // 64-bit calling conventions support varargs and register parameters, so we
- // have to do extra work to spill them in the prologue.
- if (Is64Bit && isVarArg && MFI.hasVAStart()) {
- // Find the first unallocated argument registers.
- ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
- ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
- assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
- "SSE register cannot be used when SSE is disabled!");
-
- // Gather all the live in physical registers.
- SmallVector<SDValue, 6> LiveGPRs;
- SmallVector<SDValue, 8> LiveXMMRegs;
- SDValue ALVal;
- for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
- unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
- LiveGPRs.push_back(
- DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
- }
- if (!ArgXMMs.empty()) {
- unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
- ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
- for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
- unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
- LiveXMMRegs.push_back(
- DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
- }
- }
-
- if (IsWin64) {
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(
- MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
- // Fixup to set vararg frame on shadow area (4 x i64).
- if (NumIntRegs < 4)
- FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
- } else {
- // For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so
- // they may be loaded by dereferencing the result of va_next.
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
- FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
- ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
- }
-
- // Store the integer parameter registers.
- SmallVector<SDValue, 8> MemOps;
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy(DAG.getDataLayout()));
- unsigned Offset = FuncInfo->getVarArgsGPOffset();
- for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
- RSFIN, DAG.getIntPtrConstant(Offset, dl));
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(),
- FuncInfo->getRegSaveFrameIndex(), Offset));
- MemOps.push_back(Store);
- Offset += 8;
- }
-
- if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
- // Now store the XMM (fp + vector) parameter registers.
- SmallVector<SDValue, 12> SaveXMMOps;
- SaveXMMOps.push_back(Chain);
- SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex(), dl));
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset(), dl));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other, SaveXMMOps));
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- }
-
- if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
- // Find the largest legal vector type.
- MVT VecVT = MVT::Other;
- // FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget.useAVX512Regs() &&
- (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
- CallConv == CallingConv::Intel_OCL_BI)))
- VecVT = MVT::v16f32;
- else if (Subtarget.hasAVX())
- VecVT = MVT::v8f32;
- else if (Subtarget.hasSSE2())
- VecVT = MVT::v4f32;
-
- // We forward some GPRs and some vector types.
- SmallVector<MVT, 2> RegParmTypes;
- MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
- RegParmTypes.push_back(IntVT);
- if (VecVT != MVT::Other)
- RegParmTypes.push_back(VecVT);
-
- // Compute the set of forwarded registers. The rest are scratch.
- SmallVectorImpl<ForwardedRegister> &Forwards =
- FuncInfo->getForwardedMustTailRegParms();
- CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
-
- // Forward AL for SysV x86_64 targets, since it is used for varargs.
- if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
- unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
- Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
- }
-
- // Copy all forwards from physical to virtual registers.
- for (ForwardedRegister &FR : Forwards) {
- // FIXME: Can we use a less constrained schedule?
- SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
- FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
- Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
- }
- }
+ if (IsVarArg)
+ VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
+ .lowerVarArgsParameters(Chain, StackSize);
// Some CCs need callee pop.
- if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
@@ -3677,10 +3728,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
- if (CallConv == CallingConv::X86_FastCall ||
- CallConv == CallingConv::X86_ThisCall)
- // fastcc functions can't have varargs.
- FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
@@ -3697,7 +3744,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
- int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -3705,7 +3752,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
+ for (std::pair<Register, Register> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
@@ -3716,12 +3763,13 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+ ISD::ArgFlagsTy Flags,
+ bool isByVal) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
- if (Flags.isByVal())
+ if (isByVal)
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
@@ -3796,18 +3844,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
+ const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
- const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
-
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3823,7 +3870,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = false;
}
- bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
@@ -3854,7 +3901,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Allocate shadow area for Win64.
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Outs, CC_X86);
@@ -3900,6 +3947,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
+ } else if (CLI.IsPreallocated) {
+ assert(ArgLocs.back().isMemLoc() &&
+ "cannot use preallocated attribute on a register "
+ "parameter");
+ SmallVector<size_t, 4> PreallocatedOffsets;
+ for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
+ if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
+ PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
+ }
+ }
+ auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
+ MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
+ MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
+ NumBytesToPush = 0;
}
if (!IsSibcall && !IsMustTail)
@@ -3912,7 +3974,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
@@ -3927,9 +3989,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
- // Skip inalloca arguments, they have already been written.
+ // Skip inalloca/preallocated arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
- if (Flags.isInAlloca())
+ if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
CCValAssign &VA = ArgLocs[I];
@@ -3968,8 +4030,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
- Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
- false);
+ Flags.getByValSize(),
+ std::max(Align(16), Flags.getNonZeroByValAlign()), false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
@@ -3998,12 +4060,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
- unsigned ShadowReg = 0;
+ Register ShadowReg;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
@@ -4019,7 +4081,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
- dl, DAG, VA, Flags));
+ dl, DAG, VA, Flags, isByVal));
}
}
@@ -4031,7 +4093,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
- unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
@@ -4069,8 +4131,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
-
- RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ RegsToPass.push_back(std::make_pair(Register(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
@@ -4079,7 +4140,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
- RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ RegsToPass.push_back(std::make_pair(F.PReg, Val));
}
}
@@ -4117,8 +4178,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
- // Skip inalloca arguments. They don't require any work.
- if (Flags.isInAlloca())
+ // Skip inalloca/preallocated arguments. They don't require any work.
+ if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
@@ -4219,7 +4280,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
- if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
+ if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
@@ -4278,11 +4339,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
- if (CLI.CS)
- if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+ if (CLI.CB)
+ if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
@@ -4301,12 +4363,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
- if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
- // No need to reset the stack after the call if the call doesn't return. To
- // make the MI verify, we'll pretend the callee does it for us.
- NumBytesForCalleeToPop = NumBytes;
- }
-
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
@@ -4337,7 +4393,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
-// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
@@ -4359,7 +4415,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
SelectionDAG &DAG) const {
- const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
+ const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
assert(StackSize % SlotSize == 0 &&
"StackSize must be a multiple of SlotSize");
@@ -4395,7 +4451,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
- unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
@@ -4578,7 +4634,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Allocate shadow area for Win64
if (IsCalleeWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
@@ -4693,6 +4749,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -4739,6 +4796,13 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
}
}
+static bool isTargetShuffleSplat(SDValue Op) {
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::EXTRACT_SUBVECTOR)
+ return isTargetShuffleSplat(Op.getOperand(0));
+ return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
+}
+
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -4972,7 +5036,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4985,7 +5049,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
break;
}
@@ -4997,7 +5061,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -5146,7 +5210,8 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
-bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
@@ -5374,6 +5439,19 @@ static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
+/// Return true if the value of any element in Mask is the zero sentinel value.
+static bool isAnyZero(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+}
+
+/// Return true if the value of any element in Mask is the zero or undef
+/// sentinel value.
+static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) {
+ return M == SM_SentinelZero || M == SM_SentinelUndef;
+ });
+}
+
/// Return true if Val is undef or if its value falls within the
/// specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -5511,6 +5589,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) {
return canWidenShuffleElements(Mask, WidenedMask);
}
+// Attempt to narrow/widen the shuffle mask until it matches the target number
+// of elements.
+static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
+ SmallVectorImpl<int> &ScaledMask) {
+ unsigned NumSrcElts = Mask.size();
+ assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
+ "Illegal shuffle scale factor");
+
+ // Narrowing is guaranteed to work.
+ if (NumDstElts >= NumSrcElts) {
+ int Scale = NumDstElts / NumSrcElts;
+ llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
+ return true;
+ }
+
+ // We have to repeat the widening until we reach the target size, but we can
+ // split out the first widening as it sets up ScaledMask for us.
+ if (canWidenShuffleElements(Mask, ScaledMask)) {
+ while (ScaledMask.size() > NumDstElts) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(ScaledMask, WidenedMask))
+ return false;
+ ScaledMask = std::move(WidenedMask);
+ }
+ return true;
+ }
+
+ return false;
+}
+
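As a standalone illustration of the "narrowing is guaranteed to work" direction above (plain std:: containers rather than the LLVM helpers; narrowMask is a made-up name): narrowing a shuffle mask by a factor Scale simply replaces each source index M with Scale consecutive indices.

    #include <cassert>
    #include <vector>

    // Narrow a shuffle mask: each element is split into Scale smaller ones.
    // Negative (sentinel) elements are replicated unchanged.
    std::vector<int> narrowMask(const std::vector<int> &Mask, int Scale) {
      std::vector<int> Out;
      for (int M : Mask)
        for (int I = 0; I != Scale; ++I)
          Out.push_back(M < 0 ? M : M * Scale + I);
      return Out;
    }

    int main() {
      // <1, 0> over v2i64 becomes <2, 3, 0, 1> over v4i32.
      assert((narrowMask({1, 0}, 2) == std::vector<int>{2, 3, 0, 1}));
    }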
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -5725,7 +5833,7 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
-// Helper function to collect subvector ops that are concated together,
+// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
@@ -5736,8 +5844,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
return true;
}
- if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(2))) {
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
@@ -5746,19 +5853,93 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
// TODO - Handle more general insert_subvector chains.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
- Idx == (VT.getVectorNumElements() / 2) &&
- Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(1).getValueType() == SubVT &&
- isNullConstant(Src.getOperand(2))) {
- Ops.push_back(Src.getOperand(1));
- Ops.push_back(Sub);
- return true;
+ Idx == (VT.getVectorNumElements() / 2)) {
+ // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ // insert_subvector(x, extract_subvector(x, lo), hi)
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+ Ops.append(2, Sub);
+ return true;
+ }
}
}
return false;
}
+static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
+ "Can't split odd sized vector");
+
+ SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
+ SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
+ return std::make_pair(Lo, Hi);
+}
+
+// Split a unary integer op into 2 half-sized ops.
+static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Make sure we only try to split 256/512-bit types to avoid creating
+ // narrow vectors.
+ assert((Op.getOperand(0).getValueType().is256BitVector() ||
+ Op.getOperand(0).getValueType().is512BitVector()) &&
+ (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+ assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements() &&
+ "Unexpected VTs!");
+
+ SDLoc dl(Op);
+
+ // Extract the Lo/Hi vectors
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
+}
+
+/// Break a binary integer operation into 2 half-sized ops and then
+/// concatenate the results back together.
+static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Sanity check that all the types match.
+ assert(Op.getOperand(0).getValueType() == VT &&
+ Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
+ assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+
+ SDLoc dl(Op);
+
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
+
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
+}
+
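A rough sketch of the idea behind splitVectorIntBinary, using plain arrays (addV stands in for any element-wise integer op): applying the op to each half independently and concatenating gives the same result as the full-width op.

    #include <array>
    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    template <std::size_t N>
    std::array<uint32_t, N> addV(const std::array<uint32_t, N> &A,
                                 const std::array<uint32_t, N> &B) {
      std::array<uint32_t, N> R{};
      for (std::size_t I = 0; I != N; ++I)
        R[I] = A[I] + B[I];
      return R;
    }

    int main() {
      std::array<uint32_t, 8> A{1, 2, 3, 4, 5, 6, 7, 8};
      std::array<uint32_t, 8> B{8, 7, 6, 5, 4, 3, 2, 1};
      std::array<uint32_t, 4> ALo{}, AHi{}, BLo{}, BHi{};
      for (std::size_t I = 0; I != 4; ++I) {
        ALo[I] = A[I]; AHi[I] = A[I + 4];
        BLo[I] = B[I]; BHi[I] = B[I + 4];
      }
      // Operate on the halves, then "concatenate" and compare to the whole.
      auto Whole = addV(A, B);
      auto Lo = addV(ALo, BLo), Hi = addV(AHi, BHi);
      for (std::size_t I = 0; I != 4; ++I)
        assert(Whole[I] == Lo[I] && Whole[I + 4] == Hi[I]);
    }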
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
@@ -5815,21 +5996,17 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ unsigned IdxVal = Op.getConstantOperandVal(2);
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
-
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
@@ -5849,7 +6026,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
-
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
@@ -5900,7 +6076,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
- // isel to opimitize when bits are known zero.
+ // isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
@@ -6042,8 +6218,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
- V = peekThroughBitcasts(V);
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
+ V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
@@ -6067,6 +6243,35 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
return SDValue();
}
+void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo, bool Unary) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+ Pos += (Unary ? 0 : NumElts * (i % 2));
+ Pos += (Lo ? 0 : NumEltsInLane / 2);
+ Mask.push_back(Pos);
+ }
+}
+
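For a concrete feel of the masks createUnpackShuffleMask produces, the classic two-input "lo" word unpack (punpcklwd) interleave for v8i16 is <0,8,1,9,2,10,3,11>; a tiny standalone loop that rebuilds that single-lane case:

    #include <cstdio>

    int main() {
      const int NumElts = 8; // v8i16: a single 128-bit lane of 8 elements
      for (int I = 0; I != NumElts; ++I) {
        int Pos = I / 2;            // walk the low half of the lane
        Pos += NumElts * (I % 2);   // odd result slots come from operand 1
        std::printf("%d ", Pos);    // prints: 0 8 1 9 2 10 3 11
      }
      std::printf("\n");
    }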
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i < NumElts; ++i) {
+ int Pos = i / 2;
+ Pos += (Lo ? 0 : NumElts / 2);
+ Mask.push_back(Pos);
+ }
+}
+
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
@@ -6102,14 +6307,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
-static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
- if (!Load || !ISD::isNormalLoad(Load))
- return nullptr;
-
- SDValue Ptr = Load->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
+static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
+ if (Ptr.getOpcode() == X86ISD::Wrapper ||
+ Ptr.getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr.getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
@@ -6118,6 +6319,12 @@ static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
return CNode->getConstVal();
}
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
+ return nullptr;
+ return getTargetConstantFromBasePtr(Load->getBasePtr());
+}
+
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
@@ -6298,23 +6505,6 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a broadcasted constant pool scalar.
- if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= VT.getScalarSizeInBits()) {
- if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
- unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
-
- APInt UndefSrcElts(NumSrcElts, 0);
- SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
- if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
- if (UndefSrcElts[0])
- UndefSrcElts.setBits(0, NumSrcElts);
- SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
- return CastBitData(UndefSrcElts, SrcEltBits);
- }
- }
- }
-
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
@@ -6322,16 +6512,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
SDValue Ptr = MemIntr->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!CNode || CNode->isMachineConstantPoolEntry() ||
- CNode->getOffset() != 0)
- return false;
-
- if (const Constant *C = CNode->getConstVal()) {
+ if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
@@ -6375,8 +6556,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Insert constant bits from a base and sub vector sources.
- if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
@@ -6398,8 +6578,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a subvector's source.
- if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(Op.getOperand(1))) {
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
@@ -6468,11 +6647,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
namespace llvm {
namespace X86 {
-bool isConstantSplat(SDValue Op, APInt &SplatVal) {
+bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
- UndefElts, EltBits, true, false)) {
+ UndefElts, EltBits, true,
+ AllowPartialUndefs)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (UndefElts[i])
@@ -6513,20 +6693,26 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
}
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
+/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
- bool Unary) {
+ bool Unary, unsigned NumStages = 1) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
unsigned Offset = Unary ? 0 : NumElts;
+ unsigned Repetitions = 1u << (NumStages - 1);
+ unsigned Increment = 1u << NumStages;
+ assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
- Mask.push_back(Elt + (Lane * NumEltsPerLane));
- for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
- Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ }
}
}
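An illustrative check of the multi-stage mask: packing v4i32 down to bytes in two truncating stages (saturation deliberately ignored, as the comment notes) keeps selecting source bytes 0, 4, 8, 12, which is what a two-stage unary mask for one 128-bit lane encodes. A standalone little-endian sketch:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t Src[4] = {0x03020100u, 0x07060504u, 0x0B0A0908u, 0x0F0E0D0Cu};

      // Stage 1: truncating pack i32 -> i16, unary (both inputs are Src).
      uint16_t Mid[8];
      for (int I = 0; I != 8; ++I)
        Mid[I] = (uint16_t)Src[I % 4];
      // Stage 2: truncating pack i16 -> i8, unary again.
      uint8_t Dst[16];
      for (int I = 0; I != 16; ++I)
        Dst[I] = (uint8_t)Mid[I % 8];

      // Same result as a byte shuffle of Src with the repeating mask
      // {0, 4, 8, 12, 0, 4, 8, 12, ...} (x86 is little-endian).
      uint8_t SrcBytes[16];
      std::memcpy(SrcBytes, Src, sizeof(Src));
      for (int I = 0; I != 16; ++I)
        assert(Dst[I] == SrcBytes[(I % 4) * 4]);
    }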
@@ -6597,7 +6783,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
- SDValue ImmN;
+ uint64_t ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
@@ -6608,23 +6794,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeBLENDMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeSHUFPMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeINSERTPSMask(ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
@@ -6672,13 +6857,23 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
+ case X86ISD::VALIGN:
+ assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+ "Only 32-bit and 64-bit elements are supported!");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVALIGNMask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(0));
+ break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePALIGNRMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
@@ -6686,39 +6881,34 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSLLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSRLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFHWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFLWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
@@ -6770,8 +6960,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERMMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -6783,17 +6973,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
@@ -6875,9 +7063,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
// Check if we're getting a shuffle mask with zero'd elements.
- if (!AllowSentinelZero)
- if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
- return false;
+ if (!AllowSentinelZero && isAnyZero(Mask))
+ return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
@@ -7060,6 +7247,20 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
continue;
}
+ // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
+ // base vectors.
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = V.getOperand(0);
+ int NumVecElts = Vec.getValueType().getVectorNumElements();
+ if (Vec.isUndef() && Size == NumVecElts) {
+ int Idx = V.getConstantOperandVal(2);
+ int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
+ if (M < Idx || (Idx + NumSubElts) <= M)
+ KnownUndef.setBit(i);
+ }
+ continue;
+ }
+
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
@@ -7111,7 +7312,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
@@ -7120,7 +7321,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
@@ -7132,6 +7333,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -7179,8 +7382,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
- unsigned NumSizeInBytes = NumSizeInBits / 8;
- unsigned NumBytesPerElt = NumBitsPerElt / 8;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
@@ -7220,10 +7421,21 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
true))
return false;
+
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+ if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
- scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
- scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
+ narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
+ narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
@@ -7245,14 +7457,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
- if (!isa<ConstantSDNode>(N.getOperand(2)) ||
- !N->isOnlyUserOf(Sub.getNode()))
+ if (!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Sub.getOperand(0).getValueType() == VT &&
- isa<ConstantSDNode>(Sub.getOperand(1))) {
+ Sub.getOperand(0).getValueType() == VT) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
@@ -7268,13 +7478,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
+
+ // Subvector shuffle inputs must not be larger than the subvector.
+ if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
+ return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
+ }))
+ return false;
+
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
- scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
+ narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
@@ -7284,14 +7501,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
}
Ops.push_back(Src);
- for (SDValue &SubInput : SubInputs) {
- EVT SubSVT = SubInput.getValueType().getScalarType();
- EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
- NumSizeInBits / SubSVT.getSizeInBits());
- Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
- DAG.getUNDEF(AltVT), SubInput,
- DAG.getIntPtrConstant(0, SDLoc(N))));
- }
+ Ops.append(SubInputs.begin(), SubInputs.end());
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
@@ -7304,75 +7514,83 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
return true;
}
- case ISD::SCALAR_TO_VECTOR: {
- // Match against a scalar_to_vector of an extract from a vector,
- // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
- SDValue N0 = N.getOperand(0);
- SDValue SrcExtract;
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case ISD::SCALAR_TO_VECTOR:
+ case ISD::INSERT_VECTOR_ELT: {
+    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
+    // vector, for matching src/dst vector types.
+ SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
+
+ unsigned DstIdx = 0;
+ if (Opcode != ISD::SCALAR_TO_VECTOR) {
+ // Check we have an in-range constant insertion index.
+ if (!isa<ConstantSDNode>(N.getOperand(2)) ||
+ N.getConstantOperandAPInt(2).uge(NumElts))
+ return false;
+ DstIdx = N.getConstantOperandVal(2);
- if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- N0.getOperand(0).getValueType() == VT) ||
- (N0.getOpcode() == X86ISD::PEXTRW &&
- N0.getOperand(0).getValueType() == MVT::v8i16) ||
- (N0.getOpcode() == X86ISD::PEXTRB &&
- N0.getOperand(0).getValueType() == MVT::v16i8)) {
- SrcExtract = N0;
+ // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
+ if (X86::isZeroNode(Scl)) {
+ Ops.push_back(N.getOperand(0));
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
+ return true;
+ }
}
+ // Peek through trunc/aext/zext.
+ // TODO: aext shouldn't require SM_SentinelZero padding.
+ // TODO: handle shift of scalars.
+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
+ while (Scl.getOpcode() == ISD::TRUNCATE ||
+ Scl.getOpcode() == ISD::ANY_EXTEND ||
+ Scl.getOpcode() == ISD::ZERO_EXTEND) {
+ Scl = Scl.getOperand(0);
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
+ }
+ if ((MinBitsPerElt % 8) != 0)
+ return false;
+
+ // Attempt to find the source vector the scalar was extracted from.
+ SDValue SrcExtract;
+ if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Scl.getOpcode() == X86ISD::PEXTRW ||
+ Scl.getOpcode() == X86ISD::PEXTRB) &&
+ Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
+ SrcExtract = Scl;
+ }
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
-
- unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
- if (NumSrcElts <= SrcIdx)
+ if (!SrcVT.getScalarType().isByteSized())
return false;
-
- Ops.push_back(SrcVec);
- Mask.push_back(SrcIdx);
- Mask.append(NumZeros, SM_SentinelZero);
- Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
- return true;
- }
- case X86ISD::PINSRB:
- case X86ISD::PINSRW: {
- SDValue InVec = N.getOperand(0);
- SDValue InScl = N.getOperand(1);
- SDValue InIndex = N.getOperand(2);
- if (!isa<ConstantSDNode>(InIndex) ||
- cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
- return false;
- uint64_t InIdx = N.getConstantOperandVal(2);
-
- // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
- if (X86::isZeroNode(InScl)) {
- Ops.push_back(InVec);
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
- return true;
+ unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+ unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+ unsigned DstByte = DstIdx * NumBytesPerElt;
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
+
+ // Create 'identity' byte level shuffle mask and then add inserted bytes.
+ if (Opcode == ISD::SCALAR_TO_VECTOR) {
+ Ops.push_back(SrcVec);
+ Mask.append(NumSizeInBytes, SM_SentinelUndef);
+ } else {
+ Ops.push_back(SrcVec);
+ Ops.push_back(N.getOperand(0));
+ for (int i = 0; i != (int)NumSizeInBytes; ++i)
+ Mask.push_back(NumSizeInBytes + i);
}
- // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
- // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
- unsigned ExOp =
- (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
- if (InScl.getOpcode() != ExOp)
- return false;
-
- SDValue ExVec = InScl.getOperand(0);
- SDValue ExIndex = InScl.getOperand(1);
- if (!isa<ConstantSDNode>(ExIndex) ||
- cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
- return false;
- uint64_t ExIdx = InScl.getConstantOperandVal(1);
-
- Ops.push_back(InVec);
- Ops.push_back(ExVec);
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
+ unsigned MinBytesPerElts = MinBitsPerElt / 8;
+ MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+ for (unsigned i = 0; i != MinBytesPerElts; ++i)
+ Mask[DstByte + i] = SrcByte + i;
+ for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+ Mask[DstByte + i] = SM_SentinelZero;
return true;
}
case X86ISD::PACKSS:
@@ -7412,6 +7630,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
+ case X86ISD::VTRUNC: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ // Truncated source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+ unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
+ assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ Mask.push_back(i * Scale);
+ Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
+ Ops.push_back(Src);
+ return true;
+ }
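The VTRUNC case treats a truncate as an element gather followed by zero padding: truncating the four i32 lanes of a 128-bit source to i16, viewed as a v8i16 result, corresponds to the mask <0,2,4,6,Z,Z,Z,Z> over the source reinterpreted as v8i16. A standalone sketch of that equivalence (little-endian, which is all x86 has):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t Src[4] = {0x11110000u, 0x33332222u, 0x55554444u, 0x77776666u};

      // Truncate each i32 lane to i16 and zero-fill the upper elements.
      uint16_t Trunc[8] = {};
      for (int I = 0; I != 4; ++I)
        Trunc[I] = (uint16_t)Src[I];

      // Same thing expressed as a shuffle over Src viewed as v8i16, with
      // mask {0, 2, 4, 6, Z, Z, Z, Z} where Z forces the element to zero.
      uint16_t Src16[8];
      std::memcpy(Src16, Src, sizeof(Src));
      const int Mask[8] = {0, 2, 4, 6, -1, -1, -1, -1};
      for (int I = 0; I != 8; ++I) {
        uint16_t Expected = Mask[I] < 0 ? 0 : Src16[Mask[I]];
        assert(Trunc[I] == Expected);
      }
    }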
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
@@ -7426,40 +7661,43 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
break;
uint64_t ByteShift = ShiftVal / 8;
- unsigned NumBytes = NumSizeInBits / 8;
- unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
- Mask.append(NumBytes, SM_SentinelZero);
+ Mask.append(NumSizeInBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
- for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
- for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI: {
+ // We can only decode 'whole byte' bit rotates as shuffles.
+ uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
+ if ((RotateVal % 8) != 0)
+ return false;
+ Ops.push_back(N.getOperand(0));
+ int Offset = RotateVal / 8;
+ Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int BaseIdx = i * NumBytesPerElt;
+ for (int j = 0; j != (int)NumBytesPerElt; ++j) {
+ Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
+ }
+ }
+ return true;
+ }
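The rotate case only fires when the rotate amount is a whole number of bytes, because only then is the rotate expressible as a byte shuffle: a 32-bit rotate-left by 8 permutes the little-endian bytes b0 b1 b2 b3 into b3 b0 b1 b2, i.e. the per-element mask <3,0,1,2>, matching the (NumBytesPerElt - Offset) adjustment for VROTLI. A small standalone check of that claim:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t X = 0xAABBCCDDu;
      uint32_t Rot = (X << 8) | (X >> 24);   // rotate left by 8 bits

      uint8_t Bytes[4], Shuffled[4];
      std::memcpy(Bytes, &X, 4);             // little-endian: DD CC BB AA
      const int Mask[4] = {3, 0, 1, 2};      // per-byte rotl-by-8 mask
      for (int I = 0; I != 4; ++I)
        Shuffled[I] = Bytes[Mask[I]];

      uint32_t FromShuffle;
      std::memcpy(&FromShuffle, Shuffled, 4);
      assert(FromShuffle == Rot);
    }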
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
- if (!SrcVT.isVector())
+ if (!Src.getSimpleValueType().isVector())
return false;
-
- if (NumSizeInBits != SrcVT.getSizeInBits()) {
- assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
- "Illegal broadcast type");
- SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
- NumSizeInBits / SrcVT.getScalarSizeInBits());
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
- DAG.getUNDEF(SrcVT), Src,
- DAG.getIntPtrConstant(0, SDLoc(N)));
- }
-
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
@@ -7476,22 +7714,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
- unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
- DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
- Mask);
-
- if (NumSizeInBits != SrcVT.getSizeInBits()) {
- assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
- "Illegal zero-extension type");
- SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
- NumSizeInBits / NumSrcBitsPerElt);
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
- DAG.getUNDEF(SrcVT), Src,
- DAG.getIntPtrConstant(0, SDLoc(N)));
- }
-
+ DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ IsAnyExtend, Mask);
Ops.push_back(Src);
return true;
}
@@ -7549,7 +7775,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
APInt &KnownUndef, APInt &KnownZero,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
@@ -7570,7 +7796,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG, unsigned Depth = 0,
+ const SelectionDAG &DAG, unsigned Depth = 0,
bool ResolveKnownElts = true) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
@@ -7583,93 +7809,107 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
KnownZero, DAG, Depth, ResolveKnownElts);
}
-/// Returns the scalar element that will make up the ith
+/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
-static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
- unsigned Depth) {
- if (Depth == 6)
- return SDValue(); // Limit search depth.
+static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
+ SelectionDAG &DAG, unsigned Depth) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
- SDValue V = SDValue(N, 0);
- EVT VT = V.getValueType();
- unsigned Opcode = V.getOpcode();
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
- if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+ if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
- unsigned NumElems = VT.getVectorNumElements();
- SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
- : SV->getOperand(1);
- return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+ SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- MVT ShufVT = V.getSimpleValueType();
+ MVT ShufVT = VT.getSimpleVT();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
- if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
+ ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
- return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
- : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
+ return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
+ : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
- assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
- SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
- return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
- Depth+1);
+ assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
+ SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into insert_subvector base/sub vector to find scalars.
- if (Opcode == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(2))) {
- SDValue Vec = N->getOperand(0);
- SDValue Sub = N->getOperand(1);
- EVT SubVT = Sub.getValueType();
- unsigned NumSubElts = SubVT.getVectorNumElements();
- uint64_t SubIdx = N->getConstantOperandVal(2);
+ if (Opcode == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t SubIdx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
- return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
- return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
+ return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
+ }
+
+ // Recurse into concat_vectors sub vector to find scalars.
+ if (Opcode == ISD::CONCAT_VECTORS) {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = Index / NumSubElts;
+ uint64_t SubElt = Index % NumSubElts;
+ return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
}
// Recurse into extract_subvector src vector to find scalars.
- if (Opcode == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(1))) {
- SDValue Src = N->getOperand(0);
- uint64_t SrcIdx = N->getConstantOperandVal(1);
- return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
+ if (Opcode == ISD::EXTRACT_SUBVECTOR) {
+ SDValue Src = Op.getOperand(0);
+ uint64_t SrcIdx = Op.getConstantOperandVal(1);
+ return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
}
- // Actual nodes that may contain scalar elements
+ // We only peek through bitcasts of the same vector width.
if (Opcode == ISD::BITCAST) {
- V = V.getOperand(0);
- EVT SrcVT = V.getValueType();
- unsigned NumElems = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
+ return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
+ return SDValue();
+ }
- if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
- return SDValue();
+ // Actual nodes that may contain scalar elements
+
+ // For insert_vector_elt - either return the index matching scalar or recurse
+ // into the base vector.
+ if (Opcode == ISD::INSERT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getConstantOperandAPInt(2) == Index)
+ return Op.getOperand(1);
+ return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
}
- if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return (Index == 0) ? V.getOperand(0)
+ if (Opcode == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? Op.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
- if (V.getOpcode() == ISD::BUILD_VECTOR)
- return V.getOperand(Index);
+ if (Opcode == ISD::BUILD_VECTOR)
+ return Op.getOperand(Index);
return SDValue();
}
@@ -7762,10 +8002,11 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
Elt = NextElt;
}
- // If our first insertion is not the first index then insert into zero
- // vector to break any register dependency else use SCALAR_TO_VECTOR.
+ // If our first insertion is not the first index or zeros are needed, then
+ // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
+ // elements undefined).
if (!V) {
- if (i != 0)
+ if (i != 0 || NumZero)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
@@ -7964,11 +8205,12 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// FIXME: 256-bit vector instructions don't require a strict alignment,
// improve this code to support it better.
- unsigned RequiredAlign = VT.getSizeInBits()/8;
+ Align RequiredAlign(VT.getSizeInBits() / 8);
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+ MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
+ if (!InferredAlign || *InferredAlign < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
@@ -7983,9 +8225,9 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
- if ((Offset % RequiredAlign) & 3)
+ if ((Offset % RequiredAlign.value()) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
+ int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -8024,8 +8266,8 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
case ISD::SCALAR_TO_VECTOR:
return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
case ISD::SRL:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Idx = IdxC->getZExtValue();
if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
ByteOffset += Idx / 8;
return true;
@@ -8033,13 +8275,13 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
}
break;
case ISD::EXTRACT_VECTOR_ELT:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
SDValue Src = Elt.getOperand(0);
unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
findEltLoadSrc(Src, Ld, ByteOffset)) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
+ uint64_t Idx = IdxC->getZExtValue();
ByteOffset += Idx * (SrcSizeInBits / 8);
return true;
}
@@ -8169,7 +8411,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
"Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+ LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
+ MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
@@ -8247,14 +8490,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
+ // Allow v4f32 on SSE1 only targets.
+ // FIXME: Add more isel patterns so we can just use VT directly.
+ if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
+ VecVT = MVT::v4f32;
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
- LDBase->getPointerInfo(),
- LDBase->getAlignment(),
- MachineMemOperand::MOLoad);
+ SDValue ResNode = DAG.getMemIntrinsicNode(
+ X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
+ LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
@@ -8318,13 +8563,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
-static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
+static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
@@ -8439,7 +8684,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
- // From this paterrn:
+ // From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
@@ -8486,8 +8731,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
- if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
- !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+ if (SplatBitSize == 32 || SplatBitSize == 64 ||
+ (SplatBitSize < 32 && Subtarget.hasAVX2())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
@@ -8496,46 +8741,25 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
- MVT::getVectorVT(CVT, Repeat), Ld);
- return DAG.getBitcast(VT, Brdcst);
- } else if (SplatBitSize == 32 || SplatBitSize == 64) {
- // Splatted value can fit in one FLOAT constant in constant pool.
- // Load the constant and broadcast it.
- // AVX have support for 32 and 64 bit broadcast for floats only.
- // No 64bit integer in 32bit subtarget.
- MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
- // Lower the splat via APFloat directly, to avoid any conversion.
- Constant *C =
- SplatBitSize == 32
- ? ConstantFP::get(*Ctx,
- APFloat(APFloat::IEEEsingle(), SplatValue))
- : ConstantFP::get(*Ctx,
- APFloat(APFloat::IEEEdouble(), SplatValue));
- SDValue CP = DAG.getConstantPool(C, PVT);
- unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
-
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
- MVT::getVectorVT(CVT, Repeat), Ld);
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ SDVTList Tys =
+ DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ SDValue Brdcst = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
return DAG.getBitcast(VT, Brdcst);
- } else if (SplatBitSize > 64) {
+ }
+ if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
- unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -8560,10 +8784,12 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
- if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+ // FIXME: Is the use count needed for non-constant, non-load case?
+ if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
@@ -8603,18 +8829,17 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
+ MPI, Alignment, MachineMemOperand::MOLoad);
}
}
- bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
@@ -8624,15 +8849,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
if (!IsLoad)
return SDValue();
+ // Make sure the non-chain result is only used by this build vector.
+ if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
+ return SDValue();
+
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (Subtarget.hasVLX() && ScalarSize == 64))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ (Subtarget.hasVLX() && ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
+ }
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
- if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
- if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
+ (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
}
// Unsupported broadcast.
@@ -8746,20 +8990,6 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
return NV;
}
-static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
- assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
- Op.getScalarValueSizeInBits() == 1 &&
- "Can not convert non-constant vector");
- uint64_t Immediate = 0;
- for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
- SDValue In = Op.getOperand(idx);
- if (!In.isUndef())
- Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
- }
- SDLoc dl(Op);
- MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
- return DAG.getConstant(Immediate, dl, VT);
-}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8782,11 +9012,11 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
- if (!isa<ConstantSDNode>(In))
- NonConstIdx.push_back(idx);
- else {
- Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
+ if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
+ Immediate |= (InC->getZExtValue() & 0x1) << idx;
HasConstElts = true;
+ } else {
+ NonConstIdx.push_back(idx);
}
if (SplatIdx < 0)
SplatIdx = idx;
@@ -8805,9 +9035,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
if (Cond.getOpcode() != ISD::SETCC)
Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
DAG.getConstant(1, dl, MVT::i8));
- return DAG.getSelect(dl, VT, Cond,
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+
+ // Perform the select in the scalar domain so we can use cmov.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
+ DAG.getAllOnesConstant(dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ Select = DAG.getBitcast(MVT::v32i1, Select);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
+ SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
+ DAG.getAllOnesConstant(dl, ImmVT),
+ DAG.getConstant(0, dl, ImmVT));
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ Select = DAG.getBitcast(VecVT, Select);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
+ DAG.getIntPtrConstant(0, dl));
+ }
}
// insert elements one by one
@@ -8907,8 +9152,8 @@ static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
if (!CanFold)
break;
- unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
- unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+ unsigned I0 = Op0.getConstantOperandVal(1);
+ unsigned I1 = Op1.getConstantOperandVal(1);
if (i * 2 < NumElts) {
if (V0.isUndef()) {
@@ -9056,11 +9301,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
- !isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
- unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I0 = Op0.getConstantOperandVal(1);
if (I0 != i)
return false;
@@ -9445,6 +9689,9 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return SDValue();
}
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
@@ -9452,6 +9699,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
@@ -9515,7 +9763,14 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
- return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+
+ if (!IsShift)
+ return Res;
+
+ // Immediately lower the shift to ensure the constant build vector doesn't
+ // get converted to a constant pool before the shift is lowered.
+ return LowerShift(Res, Subtarget, DAG);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
@@ -9571,9 +9826,11 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
- return extractSubVector(
- createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
- DAG, DL, SizeInBits);
+ SDValue NewSrcVec =
+ createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
+ if (NewSrcVec)
+ return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
+ return SDValue();
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
@@ -9869,7 +10126,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
return BitOp;
unsigned EVTBits = EltVT.getSizeInBits();
@@ -9929,7 +10186,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
- InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
+ InsIndex = DAG.getVectorIdxConstant(i, dl);
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
@@ -10929,6 +11186,71 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
+/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
+/// followed by a 256-bit unpack.
+static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ SmallVector<int, 32> Unpckl, Unpckh;
+ createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
+ createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
+
+ unsigned UnpackOpcode;
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ UnpackOpcode = X86ISD::UNPCKL;
+ else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ UnpackOpcode = X86ISD::UNPCKH;
+ else
+ return SDValue();
+
+ // This is a "natural" unpack operation (rather than the 128-bit sectored
+ // operation implemented by AVX). We need to rearrange 64-bit chunks of the
+ // input in order to use the x86 instruction.
+ V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
+ DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
+}
+
+// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
+// source into the lower elements and zeroing the upper elements.
+// TODO: Merge with matchShuffleAsVPMOV.
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ return false;
+
+ unsigned NumElts = Mask.size();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+ unsigned NumSrcElts = NumElts / Scale;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
+ continue;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
+ SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
+ DstVT = MVT::getIntegerVT(EltSizeInBits);
+ if ((NumSrcElts * EltSizeInBits) >= 128) {
+ // ISD::TRUNCATE
+ DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
+ } else {
+ // X86ISD::VTRUNC
+ DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
+ }
+ return true;
+ }
+
+ return false;
+}
+
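As a rough standalone illustration (assumed shapes, plain C++ rather than LLVM code) of what the VTRUNC matcher above accepts: the low NumElts/Scale lanes must take every Scale'th source element in order (undef lanes are allowed) and every remaining lane must be known zeroable.

#include <cstdio>
#include <vector>

bool looksLikeTruncate(const std::vector<int> &Mask,
                       const std::vector<bool> &Zeroable, unsigned Scale) {
  unsigned NumElts = Mask.size();
  unsigned NumSrcElts = NumElts / Scale;
  for (unsigned i = 0; i != NumSrcElts; ++i)
    if (Mask[i] >= 0 && Mask[i] != int(i * Scale))
      return false;                  // low lanes: sequential with stride Scale
  for (unsigned i = NumSrcElts; i != NumElts; ++i)
    if (!Zeroable[i])
      return false;                  // upper lanes: must be zeroable
  return true;
}

int main() {
  // A vXi16 -> vXi8 style compaction with the upper half zeroed.
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           -1, -1, -1, -1, -1, -1, -1, -1};
  std::vector<bool> Zeroable(16, false);
  for (unsigned i = 8; i != 16; ++i)
    Zeroable[i] = true;
  std::printf("%d\n", looksLikeTruncate(Mask, Zeroable, 2)); // prints 1
}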
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
@@ -11022,22 +11344,93 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
+/// Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every Nth element where N is a power of
+/// two. Example shuffle masks:
+///
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
+/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
+/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
+/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+ bool IsSingleInput) {
+ // The modulus for the shuffle vector entries is based on whether this is
+ // a single input or not.
+ int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+ assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+ "We should only be called with masks with a power-of-2 size!");
+
+ uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+ // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+ // and 2^3 simultaneously. This is because we may have ambiguity with
+ // partially undef inputs.
+ bool ViableForN[3] = {true, true, true};
+
+ for (int i = 0, e = Mask.size(); i < e; ++i) {
+ // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+ // want.
+ if (Mask[i] < 0)
+ continue;
+
+ bool IsAnyViable = false;
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j]) {
+ uint64_t N = j + 1;
+
+ // The shuffle mask must be equal to (i * 2^N) % M.
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ IsAnyViable = true;
+ else
+ ViableForN[j] = false;
+ }
+ // Early exit if we exhaust the possible powers of two.
+ if (!IsAnyViable)
+ break;
+ }
+
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j])
+ return j + 1;
+
+ // Return 0 as there is no viable power of two.
+ return 0;
+}
+
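A small standalone check, for illustration only, of the property this routine tests: Mask[i] must equal (i << N) modulo M, where M is the mask size for a single-input shuffle and twice that otherwise. The N = 1 single-input example mask from the comment satisfies it.

#include <cstdio>

int main() {
  // N = 1, single input: modulus M equals the mask size (16).
  int Mask[16] = {0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14};
  unsigned N = 1, M = 16;
  bool Matches = true;
  for (unsigned i = 0; i != 16; ++i)
    Matches = Matches && (unsigned)Mask[i] == ((i << N) & (M - 1));
  std::printf("drops even elements %u time(s): %d\n", N, Matches); // prints 1
}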
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
+// Checks for compaction shuffle masks if MaxStages > 1.
+// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
- MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
- MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
+ assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
+ "Illegal maximum compaction");
- auto MatchPACK = [&](SDValue N1, SDValue N2) {
+ auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
+ unsigned NumSrcBits = PackVT.getScalarSizeInBits();
+ unsigned NumPackedBits = NumSrcBits - BitSize;
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
- if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
- APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
+ if (Subtarget.hasSSE41() || BitSize == 8) {
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
@@ -11047,8 +11440,8 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
return true;
}
}
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
@@ -11058,19 +11451,25 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
return false;
};
- // Try binary shuffle.
- SmallVector<int, 32> BinaryMask;
- createPackShuffleMask(VT, BinaryMask, false);
- if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
- if (MatchPACK(V1, V2))
- return true;
+ // Attempt to match against wider and wider compaction patterns.
+ for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
+ MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
+ MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
- // Try unary shuffle.
- SmallVector<int, 32> UnaryMask;
- createPackShuffleMask(VT, UnaryMask, true);
- if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
- if (MatchPACK(V1, V1))
- return true;
+ // Try binary shuffle.
+ SmallVector<int, 32> BinaryMask;
+ createPackShuffleMask(VT, BinaryMask, false, NumStages);
+ if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
+ if (MatchPACK(V1, V2, PackVT))
+ return true;
+
+ // Try unary shuffle.
+ SmallVector<int, 32> UnaryMask;
+ createPackShuffleMask(VT, UnaryMask, true, NumStages);
+ if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
+ if (MatchPACK(V1, V1, PackVT))
+ return true;
+ }
return false;
}
@@ -11080,12 +11479,44 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
- if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
- Subtarget))
- return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
- DAG.getBitcast(PackVT, V2));
+ unsigned SizeBits = VT.getSizeInBits();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned MaxStages = Log2_32(64 / EltBits);
+ if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget, MaxStages))
+ return SDValue();
- return SDValue();
+ unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
+ unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
+
+  // Don't lower multi-stage packs on AVX512; truncation is better.
+ if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
+ return SDValue();
+
+ // Pack to the largest type possible:
+ // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
+ unsigned MaxPackBits = 16;
+ if (CurrentEltBits > 16 &&
+ (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
+ MaxPackBits = 32;
+
+ // Repeatedly pack down to the target size.
+ SDValue Res;
+ for (unsigned i = 0; i != NumStages; ++i) {
+ unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
+ unsigned NumSrcElts = SizeBits / SrcEltBits;
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
+ Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
+ DAG.getBitcast(SrcVT, V2));
+ V1 = V2 = Res;
+ CurrentEltBits /= 2;
+ }
+ assert(Res && Res.getValueType() == VT &&
+ "Failed to lower compaction shuffle");
+ return Res;
}
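For illustration (a standalone sketch of the loop's arithmetic, not part of the patch), the element-width progression of the repacking loop above when compacting a 128-bit vector from 64-bit down to 8-bit elements with 32-bit packs available is three stages, each halving the element width; the first stage can already use a dword pack because the matched pattern guarantees the packed bits fit.

#include <algorithm>
#include <cstdio>

int main() {
  unsigned SizeBits = 128, DstEltBits = 8;
  unsigned CurrentEltBits = 64; // element width matched by the PACK pattern
  unsigned MaxPackBits = 32;    // PACKSSDW/PACKUSDW usable in this scenario
  while (CurrentEltBits != DstEltBits) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    std::printf("pack v%ui%u -> v%ui%u\n", SizeBits / SrcEltBits, SrcEltBits,
                2 * (SizeBits / SrcEltBits), SrcEltBits / 2);
    CurrentEltBits /= 2;
  }
  // Prints: pack v4i32 -> v8i16, pack v4i32 -> v8i16, pack v8i16 -> v16i8
}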
/// Try to emit a bitmask instruction for a shuffle.
@@ -11109,8 +11540,9 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
- AllOnes = DAG.getConstantFP(
- APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+ APFloat AllOnesValue = APFloat::getAllOnesValue(
+ SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
+ AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
} else {
@@ -11312,6 +11744,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
+ // If we have VPTERNLOG, we can use that as a bit blend.
+ if (Subtarget.hasVLX())
+ if (SDValue BitBlend =
+ lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
@@ -11622,10 +12060,101 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend(
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
-/// Try to lower a vector shuffle as a rotation.
+/// Try to lower a vector shuffle as a bit rotation.
+///
+/// Look for a repeated rotation pattern in each sub group.
+/// Returns a ISD::ROTL element rotation amount or -1 if failed.
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
+ int NumElts = Mask.size();
+ assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
+
+ int RotateAmt = -1;
+ for (int i = 0; i != NumElts; i += NumSubElts) {
+ for (int j = 0; j != NumSubElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ if (!isInRange(M, i, i + NumSubElts))
+ return -1;
+ int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
+ if (0 <= RotateAmt && Offset != RotateAmt)
+ return -1;
+ RotateAmt = Offset;
+ }
+ }
+ return RotateAmt;
+}
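A standalone sketch (illustrative only) of the per-group matching done above: within each group of NumSubElts lanes, every defined mask entry must imply the same element rotate amount, computed as (NumSubElts - (M - (GroupBase + j))) % NumSubElts.

#include <cstdio>
#include <vector>

// Returns the common element rotate amount, or -1 if the mask is not a
// per-group rotation.
int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;                        // undef lanes are free
      if (M < i || M >= i + NumSubElts)
        return -1;                       // must stay within its own group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (0 <= RotateAmt && Offset != RotateAmt)
        return -1;
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  // A byte shuffle {3,0,1,2, 7,4,5,6, ...} is a ROTL by one element per group.
  std::vector<int> Mask = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
  std::printf("%d\n", matchBitRotate(Mask, 4)); // prints 1
}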
+
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+ const X86Subtarget &Subtarget,
+ ArrayRef<int> Mask) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
+
+ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
+ int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
+ int MaxSubElts = 64 / EltSizeInBits;
+ for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+ int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+ if (RotateAmt < 0)
+ continue;
+
+ int NumElts = Mask.size();
+ MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+ RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+ return RotateAmt * EltSizeInBits;
+ }
+
+ return -1;
+}
+
+/// Lower shuffle using X86ISD::VROTLI rotations.
+static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Only XOP + AVX512 targets have bit rotation instructions.
+ // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
+ bool IsLegal =
+ (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
+ if (!IsLegal && Subtarget.hasSSE3())
+ return SDValue();
+
+ MVT RotateVT;
+ int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+ Subtarget, Mask);
+ if (RotateAmt < 0)
+ return SDValue();
+
+ // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+ // expanded to OR(SRL,SHL), will be more efficient, but if they can
+  // widen to vXi16 or more then the existing lowering will be better.
+ if (!IsLegal) {
+ if ((RotateAmt % 16) == 0)
+ return SDValue();
+ // TODO: Use getTargetVShiftByConstNode.
+ unsigned ShlAmt = RotateAmt;
+ unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+ V1 = DAG.getBitcast(RotateVT, V1);
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+ SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
+ return DAG.getBitcast(VT, Rot);
+ }
+
+ SDValue Rot =
+ DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, Rot);
+}
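The pre-XOP/AVX512 fallback above relies on the usual rotate identity; a minimal scalar sketch of the same expansion, for illustration only:

#include <cstdint>
#include <cstdio>

// rotl by a whole-byte amount Amt in (0, 16), expanded exactly like the
// VSHLI/VSRLI/OR sequence above.
uint16_t rotl16(uint16_t X, unsigned Amt) {
  return (uint16_t)((X << Amt) | (X >> (16 - Amt)));
}

int main() {
  std::printf("0x%04x\n", rotl16(0x12ab, 8)); // 0xab12: a per-element byte swap
}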
+
+/// Try to match a vector shuffle as an element rotation.
///
 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
-static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
+static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
@@ -11712,7 +12241,7 @@ static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
- if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ if (isAnyZero(Mask))
return -1;
// PALIGNR works on 128-bit lanes.
@@ -11720,7 +12249,7 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
- int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
+ int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
@@ -11788,7 +12317,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
+static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11800,7 +12329,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
- int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
+ int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
@@ -12566,13 +13095,13 @@ static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
- EVT EltVT = VT.getVectorElementType();
- EVT V0VT = V0.getValueType();
+ MVT EltVT = VT.getVectorElementType();
+ MVT V0VT = V0.getSimpleValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
- EVT V0EltVT = V0VT.getVectorElementType();
+ MVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
@@ -12636,7 +13165,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
- EVT VT = N0.getValueType();
+ MVT VT = N0.getSimpleValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
@@ -12649,9 +13178,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
return SDValue();
SDValue WideVec = N0.getOperand(0);
- EVT WideVT = WideVec.getValueType();
- if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
- !isa<ConstantSDNode>(N1.getOperand(1)))
+ MVT WideVT = WideVec.getSimpleValueType();
+ if (!WideVT.is256BitVector())
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
@@ -12699,7 +13227,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
- unsigned NumElts = Mask.size();
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
@@ -12707,15 +13234,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
- int BroadcastIdx = -1;
- for (int i = 0; i != (int)NumElts; ++i) {
- SmallVector<int, 8> BroadcastMask(NumElts, i);
- if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
- BroadcastIdx = i;
- break;
- }
- }
-
+ int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
@@ -12724,6 +13243,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ // TODO: Combine this logic with findEltLoadSrc() used by
+ // EltsFromConsecutiveLoads().
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
@@ -12739,14 +13260,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
BitOffset %= OpBitWidth;
continue;
}
+ case ISD::EXTRACT_SUBVECTOR: {
+ // The extraction index adds to the existing offset.
+ unsigned EltBitWidth = V.getScalarValueSizeInBits();
+ unsigned Idx = V.getConstantOperandVal(1);
+ unsigned BeginOffset = Idx * EltBitWidth;
+ BitOffset += BeginOffset;
+ V = V.getOperand(0);
+ continue;
+ }
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
- auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
- if (!ConstantIdx)
- break;
-
int EltBitWidth = VOuter.getScalarValueSizeInBits();
- int Idx = (int)ConstantIdx->getZExtValue();
+ int Idx = (int)V.getConstantOperandVal(2);
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
@@ -12777,8 +13303,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
- MVT BroadcastVT = VT;
-
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
@@ -12788,23 +13312,34 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
- // 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
- BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
- Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
- ? X86ISD::MOVDDUP
- : Opcode;
- }
+ } else if (ISD::isNormalLoad(V.getNode()) &&
+ cast<LoadSDNode>(V)->isSimple()) {
+ // We do not check for one-use of the vector load because a broadcast load
+ // is expected to be a win for code size, register pressure, and possibly
+ // uops even if the original vector load is not eliminated.
- // If we are broadcasting a load that is only used by the shuffle
- // then we can reduce the vector load to the broadcasted scalar load.
+ // Reduce the vector load and shuffle to a broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
- EVT SVT = BroadcastVT.getScalarType();
+ MVT SVT = VT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+
+ // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
+ // than MOVDDUP.
+ // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), NewAddr};
+ V = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+ return DAG.getBitcast(VT, V);
+ }
+ assert(SVT == MVT::f64 && "Unexpected VT!");
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
@@ -12839,38 +13374,26 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
- // Bitcast back to the same scalar type as BroadcastVT.
- if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
- assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
- "Unexpected vector element size");
- MVT ExtVT;
- if (V.getValueType().isVector()) {
- unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
- ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
- } else {
- ExtVT = BroadcastVT.getScalarType();
- }
- V = DAG.getBitcast(ExtVT, V);
- }
-
- // 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
- V = DAG.getBitcast(MVT::f64, V);
- unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
- BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
+ // If this is a scalar, do the broadcast on this type and bitcast.
+ if (!V.getValueType().isVector()) {
+ assert(V.getScalarValueSizeInBits() == NumEltBits &&
+ "Unexpected scalar size");
+ MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
- if (V.getValueSizeInBits() > 128) {
- MVT ExtVT = V.getSimpleValueType().getScalarType();
- ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
+ if (V.getValueSizeInBits() > 128)
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
- V = DAG.getBitcast(ExtVT, V);
- }
- return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
+ // Otherwise cast V to a vector with the same element type as VT, but
+ // possibly narrower than VT. Then perform the broadcast.
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
+ return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
@@ -13259,7 +13782,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -13293,8 +13816,7 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
- int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
-
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
@@ -13548,7 +14070,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -14186,6 +14708,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
@@ -14262,6 +14789,29 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
+  // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
+ // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
+ // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
+ if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
+ !Subtarget.hasVLX()) {
+ SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
+ for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
+ DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
+ DWordClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
+ DWordClearMask);
+ // Now pack things back together.
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
+ if (NumEvenDrops == 2) {
+ Result = DAG.getBitcast(MVT::v4i32, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
+ }
+ return Result;
+ }
+
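For illustration (a standalone sketch, not part of the patch), what the AND + PACKUSDW sequence above computes when NumEvenDrops == 1: clearing the high 16 bits of every dword and then doing an unsigned 32-to-16 pack keeps word elements 0, 2, 4, 6 of each input, which is exactly the compaction mask being lowered.

#include <cstdint>
#include <cstdio>

int main() {
  uint16_t V1[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint16_t V2[8] = {8, 9, 10, 11, 12, 13, 14, 15};
  uint16_t Res[8];
  for (int i = 0; i != 4; ++i) {
    // View each input as v4i32 and AND with the 0x0000FFFF clear mask.
    uint32_t D1 = ((uint32_t)V1[2 * i] | ((uint32_t)V1[2 * i + 1] << 16)) & 0xFFFFu;
    uint32_t D2 = ((uint32_t)V2[2 * i] | ((uint32_t)V2[2 * i + 1] << 16)) & 0xFFFFu;
    // PACKUSDW: saturate each dword to u16; low half from V1, high half from V2.
    Res[i] = (uint16_t)(D1 > 0xFFFF ? 0xFFFF : D1);
    Res[i + 4] = (uint16_t)(D2 > 0xFFFF ? 0xFFFF : D2);
  }
  for (uint16_t R : Res)
    std::printf("%u ", (unsigned)R); // 0 2 4 6 8 10 12 14
  std::printf("\n");
}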
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
@@ -14281,72 +14831,6 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG);
}
-/// Check whether a compaction lowering can be done by dropping even
-/// elements and compute how many times even elements must be dropped.
-///
-/// This handles shuffles which take every Nth element where N is a power of
-/// two. Example shuffle masks:
-///
-/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
-/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
-/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
-/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
-/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
-///
-/// Any of these lanes can of course be undef.
-///
-/// This routine only supports N <= 3.
-/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
-/// for larger N.
-///
-/// \returns N above, or the number of times even elements must be dropped if
-/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
- bool IsSingleInput) {
- // The modulus for the shuffle vector entries is based on whether this is
- // a single input or not.
- int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
- assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
- "We should only be called with masks with a power-of-2 size!");
-
- uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
-
- // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
- // and 2^3 simultaneously. This is because we may have ambiguity with
- // partially undef inputs.
- bool ViableForN[3] = {true, true, true};
-
- for (int i = 0, e = Mask.size(); i < e; ++i) {
- // Ignore undef lanes, we'll optimistically collapse them to the pattern we
- // want.
- if (Mask[i] < 0)
- continue;
-
- bool IsAnyViable = false;
- for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
- if (ViableForN[j]) {
- uint64_t N = j + 1;
-
- // The shuffle mask must be equal to (i * 2^N) % M.
- if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
- IsAnyViable = true;
- else
- ViableForN[j] = false;
- }
- // Early exit if we exhaust the possible powers of two.
- if (!IsAnyViable)
- break;
- }
-
- for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
- if (ViableForN[j])
- return j + 1;
-
- // Return 0 as there is no viable power of two.
- return 0;
-}
-
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -14410,6 +14894,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
@@ -14524,6 +15013,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
+ // Check for compaction patterns.
+ bool IsSingleInput = V2.isUndef();
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
@@ -14534,10 +15027,13 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
+ // If the mask is a binary compaction, we can more efficiently perform this
+ // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
+ //
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
- if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
bool V1InUse = false;
bool V2InUse = false;
@@ -14595,8 +15091,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
- bool IsSingleInput = V2.isUndef();
- if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+ if (NumEvenDrops) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
@@ -14604,23 +15099,23 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
- SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
- for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
- ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
- SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+ SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
+ for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
+ WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
+ SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
+ WordClearMask);
if (!IsSingleInput)
- V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
+ WordClearMask);
// Now pack things back together.
- V1 = DAG.getBitcast(MVT::v8i16, V1);
- V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
- SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
+ IsSingleInput ? V1 : V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
-
return Result;
}
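
// A minimal intrinsics sketch of the NumEvenDrops == 1 lowering above
// (illustrative only, assumes SSE2; the helper name is ours, not LLVM's):
// clear the odd bytes through a v8i16 mask, then let PACKUSWB keep the low
// byte of each word. Saturation never fires because every masked word is
// <= 0xFF.
#include <immintrin.h>
__m128i compactEvenBytes(__m128i V1, __m128i V2) {
  const __m128i WordClearMask = _mm_set1_epi16(0x00FF);
  __m128i Lo = _mm_and_si128(V1, WordClearMask);
  __m128i Hi = _mm_and_si128(V2, WordClearMask);
  return _mm_packus_epi16(Lo, Hi); // even bytes of V1, then even bytes of V2
}
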
@@ -14725,37 +15220,13 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
- MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
- // Rather than splitting build-vectors, just build two narrower build
- // vectors. This helps shuffling with splats and zeros.
+ // Use splitVector/extractSubVector so that split build-vectors just build two
+ // narrower build vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
- V = peekThroughBitcasts(V);
-
- MVT OrigVT = V.getSimpleValueType();
- int OrigNumElements = OrigVT.getVectorNumElements();
- int OrigSplitNumElements = OrigNumElements / 2;
- MVT OrigScalarVT = OrigVT.getVectorElementType();
- MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
-
SDValue LoV, HiV;
-
- auto *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV) {
- LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
- DAG.getIntPtrConstant(0, DL));
- HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
- DAG.getIntPtrConstant(OrigSplitNumElements, DL));
- } else {
-
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (int i = 0; i < OrigSplitNumElements; ++i) {
- LoOps.push_back(BV->getOperand(i));
- HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
- }
- LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
- HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
- }
+ std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
@@ -15963,7 +16434,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
+ narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
@@ -15984,7 +16455,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16085,13 +16556,14 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
if (V2.isUndef()) {
- SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
- if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
-
- if (Subtarget.hasAVX2())
+ }
+ if (Subtarget.hasAVX2()) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
-
+ }
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
@@ -16190,7 +16662,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16210,9 +16682,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
- // If the shuffle patterns aren't repeated but it is a single input, directly
- // generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+
+ // If the shuffle patterns aren't repeated but it's a single input, directly
+ // generate a cross-lane VPERMD instruction.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
@@ -16294,6 +16771,16 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
@@ -16379,7 +16866,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -16387,6 +16874,12 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16396,6 +16889,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
@@ -16518,13 +17016,14 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
- SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, WidenedMask))
+ SmallVector<int, 4> Widened128Mask;
+ if (!canWidenShuffleElements(Mask, Widened128Mask))
return SDValue();
+ assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
// Try to use an insert into a zero vector.
- if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
- (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@@ -16536,37 +17035,34 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
- {0, 1, 2, 3, 0, 1, 2, 3});
- if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
- {0, 1, 2, 3, 8, 9, 10, 11})) {
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
+ if (OnlyUsesV1 ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
+ SDValue SubVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(4, DL));
}
- assert(WidenedMask.size() == 4);
-
// See if this is an insertion of the lower 128-bits of V2 into V1.
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
- assert(WidenedMask[i] >= -1);
- if (WidenedMask[i] < 0)
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
- if (WidenedMask[i] < 4) {
- if (WidenedMask[i] != i) {
+ if (Widened128Mask[i] < 4) {
+ if (Widened128Mask[i] != i) {
IsInsert = false;
break;
}
} else {
      // Make sure we only have a single V2 index and that it's the lowest 128-bits.
- if (V2Index >= 0 || WidenedMask[i] != 4) {
+ if (V2Index >= 0 || Widened128Mask[i] != 4) {
IsInsert = false;
break;
}
@@ -16580,16 +17076,26 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
+  // See if we can widen to a 256-bit lane shuffle. We're going to lose
+  // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
+  // widening where possible we at least ensure the lanes stay sequential to
+  // help later combines.
+ SmallVector<int, 2> Widened256Mask;
+ if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
+ Widened128Mask.clear();
+ narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
+ }
+
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
  // Ensure elements came from the same Op.
for (int i = 0; i < 4; ++i) {
- assert(WidenedMask[i] >= -1);
- if (WidenedMask[i] < 0)
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
continue;
- SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
+ SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
@@ -16598,7 +17104,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
- PermMask |= (WidenedMask[i] % 4) << (i * 2);
+ PermMask |= (Widened128Mask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
@@ -16696,6 +17202,12 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
if (V2.isUndef() &&
@@ -16728,7 +17240,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
+ narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
@@ -16752,7 +17264,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16814,7 +17326,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16833,6 +17345,13 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
@@ -16841,6 +17360,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
+
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
@@ -16865,6 +17385,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16876,18 +17401,23 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
- return lowerV8I16GeneralSingleInputShuffle(
- DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
+ RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
@@ -16933,6 +17463,17 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Lower as AND if possible.
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
@@ -16995,6 +17536,18 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
+ // Try using bit ops for masking and blending before falling back to
+ // splitting.
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
@@ -17477,6 +18030,10 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
+ // Expand v32i16/v64i8 without BWI.
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return SDValue();
+
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
@@ -17532,14 +18089,24 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
SDLoc dl(Op);
- if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+ if (!Vec.getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
- SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
+ // we're going to zero extend the register or fold the store.
+ if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
+ !MayFoldIntoStore(Op))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -17552,22 +18119,17 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
- if ((User->getOpcode() != ISD::STORE ||
- isNullConstant(Op.getOperand(1))) &&
+ if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
- Op.getOperand(1));
+ DAG.getBitcast(MVT::v4i32, Vec), Idx);
return DAG.getBitcast(MVT::f32, Extract);
}
- if (VT == MVT::i32 || VT == MVT::i64) {
- // ExtractPS/pextrq works with constant index.
- if (isa<ConstantSDNode>(Op.getOperand(1)))
+ if (VT == MVT::i32 || VT == MVT::i64)
return Op;
- }
return SDValue();
}
@@ -17580,6 +18142,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+ auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
@@ -17587,7 +18150,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
// variable index can't be handled in mask registers,
// extend vector to VR512/128
- if (!isa<ConstantSDNode>(Idx)) {
+ if (!IdxC) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
@@ -17598,7 +18161,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = IdxC->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
@@ -17627,11 +18190,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+ auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
- if (!isa<ConstantSDNode>(Idx)) {
+ if (!IdxC) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
// IACA tool was used to get performance estimation
@@ -17665,7 +18229,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = IdxC->getZExtValue();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
@@ -17697,9 +18261,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- // Transform it so it match pextrw which produces a 32-bit result.
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -17789,9 +18351,7 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
-
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
- Op.getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -17864,11 +18424,22 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
- if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
- (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
- EltVT == MVT::i64)) {
- N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
- return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
+ N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ return DAG.getBitcast(VT, N1);
+ }
}
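
// A minimal intrinsics sketch of the new i8/i16 insert-into-zero case above
// (illustrative only; the helper name is ours): zero extend to i32 and MOVD
// into lane 0. MOVD already zeroes the rest of the register, so no extra
// shuffle is needed.
#include <immintrin.h>
__m128i insertU16IntoZeroVector(unsigned short S) {
  return _mm_cvtsi32_si128(S); // v4i32 {S,0,0,0}; bitcast to v8i16 as needed
}
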
  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
@@ -17981,12 +18552,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
- SDValue Idx = Op.getOperand(1);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ uint64_t IdxVal = Op.getConstantOperandVal(1);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
@@ -18045,7 +18612,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
+ CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
@@ -18554,25 +19121,47 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
-
- assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
- "Unexpected funnel shift type!");
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
- if (!OptForSize && Subtarget.isSHLDSlow())
- return SDValue();
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
- if (IsFSHR)
- std::swap(Op0, Op1);
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
+ if (VT == MVT::i8 || ExpandFunnel)
+ return SDValue();
  // i16 needs the shift amount taken modulo 16, but i32/i64 have an implicit modulo.
- if (VT == MVT::i16)
+ if (VT == MVT::i16) {
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
+ unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
+ return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
+ }
- unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
- return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
+ return Op;
}
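
// A scalar model of the variable-amount i8 expansion above (illustrative
// only; the helper names are ours): widen into one 32-bit value, shift, and
// truncate, exactly as the fshl/fshr formulas in the comment describe.
#include <cstdint>
static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t R = ((uint32_t)X << 8) | Y;
  return (uint8_t)((R << (Z & 7)) >> 8);
}
static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
  uint32_t R = ((uint32_t)X << 8) | Y;
  return (uint8_t)(R >> (Z & 7));
}
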
// Try to use a packed vector operation to handle i64 on 32-bit targets when
@@ -18682,6 +19271,56 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
+/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
+/// try to vectorize the cast ops. This will avoid an expensive round-trip
+/// between XMM and GPR.
+static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Allow FP_TO_UINT.
+ SDValue CastToInt = CastToFP.getOperand(0);
+ MVT VT = CastToFP.getSimpleValueType();
+ if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
+ return SDValue();
+
+ MVT IntVT = CastToInt.getSimpleValueType();
+ SDValue X = CastToInt.getOperand(0);
+ MVT SrcVT = X.getSimpleValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // See if we have 128-bit vector cast instructions for this type of cast.
+ // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
+ if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
+ IntVT != MVT::i32)
+ return SDValue();
+
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned IntSize = IntVT.getSizeInBits();
+ unsigned VTSize = VT.getSizeInBits();
+ MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
+ MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
+
+ // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
+ unsigned ToIntOpcode =
+ SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
+ unsigned ToFPOpcode =
+ IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
+
+ // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
+ //
+ // We are not defining the high elements (for example, zero them) because
+ // that could nullify any performance advantage that we hoped to gain from
+ // this vector op hack. We do not expect any adverse effects (like denorm
+ // penalties) with cast ops.
+ SDLoc DL(CastToFP);
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
+ SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
+ SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
+}
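
// The scalar source pattern the new lowerFPToIntToFP transform targets, as a
// sketch (the instruction names reflect typical SSE2 codegen and are an
// editorial assumption, not taken from this patch):
float truncViaInt(float X) {
  // Before: cvttss2si (XMM->GPR) followed by cvtsi2ss (GPR->XMM).
  // With this lowering: cvttps2dq + cvtdq2ps, staying entirely in XMM.
  return (float)(int)X;
}
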
+
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
@@ -18739,15 +19378,15 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
SmallVector<SDValue, 4> SignCvts(4);
SmallVector<SDValue, 4> Chains(4);
for (int i = 0; i != 4; ++i) {
- SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
DAG.getIntPtrConstant(i, DL));
if (IsStrict) {
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
- {Op.getOperand(0), Src});
+ {Op.getOperand(0), Elt});
Chains[i] = SignCvts[i].getValue(1);
} else {
- SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
}
}
SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
@@ -18784,6 +19423,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
+ if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
+ return R;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Note: Since v2f64 is a legal type. We don't need to zero extend the
@@ -18832,21 +19474,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
SDValue ValueToStore = Src;
- if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
+ if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- unsigned Size = SrcVT.getSizeInBits()/8;
+ unsigned Size = SrcVT.getStoreSize();
+ Align Alignment(Size);
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
- int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- Chain = DAG.getStore(
- Chain, dl, ValueToStore, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
- std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+ Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -18854,58 +19498,40 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return Tmp.first;
}
-std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
- SDValue StackSlot,
- SelectionDAG &DAG) const {
+std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
+ EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
// Build the FILD
- SDLoc DL(Op);
SDVTList Tys;
- bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ bool useSSE = isScalarFPTypeInSSEReg(DstVT);
if (useSSE)
- Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+ Tys = DAG.getVTList(MVT::f80, MVT::Other);
else
- Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ Tys = DAG.getVTList(DstVT, MVT::Other);
- unsigned ByteSize = SrcVT.getSizeInBits() / 8;
-
- FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
- MachineMemOperand *LoadMMO;
- if (FI) {
- int SSFI = FI->getIndex();
- LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOLoad, ByteSize, ByteSize);
- } else {
- LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
- StackSlot = StackSlot.getOperand(1);
- }
- SDValue FILDOps[] = {Chain, StackSlot};
+ SDValue FILDOps[] = {Chain, Pointer};
SDValue Result =
- DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
- Tys, FILDOps, SrcVT, LoadMMO);
+ DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
+ Alignment, MachineMemOperand::MOLoad);
Chain = Result.getValue(1);
if (useSSE) {
- SDValue InFlag = Result.getValue(2);
-
- // FIXME: Currently the FST is glued to the FILD_FLAG. This
- // shouldn't be necessary except that RFP cannot be live across
- // multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueSizeInBits() / 8;
- int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
+ unsigned SSFISize = DstVT.getStoreSize();
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
- SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
+ SDValue FSTOps[] = {Chain, Result, StackSlot};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOStore, SSFISize, SSFISize);
+ MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
- Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
- Op.getValueType(), StoreMMO);
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
Result = DAG.getLoad(
- Op.getValueType(), DL, Chain, StackSlot,
+ DstVT, DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
Chain = Result.getValue(1);
}
@@ -18948,7 +19574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
SmallVector<Constant*,2> CV1;
CV1.push_back(
@@ -18958,7 +19584,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
// Load the 64-bit value into an XMM register.
SDValue XR1 =
@@ -19163,13 +19789,13 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
*DAG.getContext(),
APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDValue VBias = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /*Alignment*/ 8, MachineMemOperand::MOLoad);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
+ MachineMemOperand::MOLoad);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
DAG.getBitcast(MVT::v4i64, VBias));
@@ -19337,15 +19963,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
- SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
+ int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 =
- DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
+ DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MachinePointerInfo());
+ OffsetSlot, MPI.getWithOffset(4), 4);
std::pair<SDValue, SDValue> Tmp =
- BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -19361,21 +19990,17 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
- DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
- int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
- MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOLoad, 8, 8);
-
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
- SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
- MVT::i64, MMO);
+ SDValue Fild =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
+ Align(8), MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
@@ -19388,6 +20013,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF), PtrVT);
+ Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
@@ -19399,7 +20025,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- /* Alignment = */ 4);
+ CPAlignment);
Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
@@ -19462,7 +20088,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getStoreSize();
- int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
@@ -19537,20 +20164,20 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
- SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
+ MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
}
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPI, MachineMemOperand::MOStore, MemSize, MemSize);
+ MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
DAG.getVTList(MVT::Other),
@@ -19590,14 +20217,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ExtendInVecOpc, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
@@ -19729,7 +20351,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
- // Requires SSE2 but AVX512 has fast vector truncate.
+ // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
if (!Subtarget.hasSSE2())
return SDValue();
@@ -19770,15 +20392,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
- SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
- // Extract lower/upper subvectors.
- unsigned NumSubElts = NumElems / 2;
- SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ // Split lower/upper subvectors.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(In, DAG, DL);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
@@ -19804,7 +20425,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
- scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
+ narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
@@ -19818,7 +20439,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
- EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
@@ -19865,17 +20486,22 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
- // the original type is v16i8. In that case we can't split the v16i8 so
- // first we pre-extend it to v16i16 which we can split to v8i16, then extend
- // to v8i32, truncate that to v8i1 and concat the two halves.
+ // the original type is v16i8. In that case we can't split the v16i8
+ // directly, so we need to shuffle high elements to low and use
+ // sign_extend_vector_inreg.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+ SDValue Lo, Hi;
if (InVT == MVT::v16i8) {
- // First we need to sign extend up to 256-bits so we can split that.
- InVT = MVT::v16i16;
- In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
+ Hi = DAG.getVectorShuffle(
+ InVT, DL, In, In,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
+ } else {
+ assert(InVT == MVT::v16i16 && "Unexpected VT!");
+ Lo = extract128BitVector(In, 0, DAG, DL);
+ Hi = extract128BitVector(In, 8, DAG, DL);
}
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, 8, DAG, DL);
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
@@ -19918,7 +20544,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
- assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+ assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
+ "Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
@@ -19942,6 +20569,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
+ if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(VT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
    // word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
@@ -20174,6 +20806,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+ if (!Subtarget.hasVLX()) {
+      // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the
+      // type legalizer and then widened again by vector op legalization.
+ if (!IsStrict)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
+ {Src, Zero, Zero, Zero});
+ Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Tmp});
+ SDValue Chain = Tmp.getValue(1);
+ Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp, Chain}, dl);
+ return Tmp;
+ }
+
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
@@ -20281,6 +20932,62 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
+SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // If the source is in an SSE register, the node is Legal.
+ if (isScalarFPTypeInSSEReg(SrcVT))
+ return Op;
+
+ return LRINT_LLRINTHelper(Op.getNode(), DAG);
+}
+
+SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
+ SelectionDAG &DAG) const {
+ EVT DstVT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue Chain = DAG.getEntryNode();
+
+ bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
+
+ // If we're converting from SSE, the stack slot needs to hold both types.
+ // Otherwise it only needs to hold the DstVT.
+ EVT OtherVT = UseSSE ? SrcVT : DstVT;
+ SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ if (UseSSE) {
+ assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Chain, StackPtr };
+
+ Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Src.getValue(1);
+ }
+
+ SDValue StoreOps[] = { Chain, Src, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
+ StoreOps, DstVT, MPI, /*Align*/ None,
+ MachineMemOperand::MOStore);
+
+ return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
+}
+
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20333,6 +21040,67 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return Tmp.first;
}
+static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
+ DAG.getConstant(0, dl, MVT::v8i16), Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
+ {Op.getOperand(0), Res});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
+ DAG.getConstantFP(0, dl, MVT::v4f32), Src,
+ DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(
+ X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
+ Chain = Res.getValue(1);
+ } else {
+ // FIXME: Should we use zeros for upper elements for non-strict?
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
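
// The mirror FP_TO_FP16 direction as an intrinsics sketch (assumes F16C; the
// helper name is ours). The immediate 4 is _MM_FROUND_CUR_DIRECTION, the same
// rounding-control constant passed to CVTPS2PH above.
#include <immintrin.h>
unsigned short floatToHalf(float F) {
  __m128i V = _mm_cvtps_ph(_mm_set_ss(F), 4);
  return (unsigned short)_mm_cvtsi128_si32(V);
}
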
+
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
@@ -20413,6 +21181,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
+/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+/// This mode isn't supported in hardware on X86. But as long as we aren't
+/// compiling with trapping math, we can emulate this with
+/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // N0 += copysign(nextafter(0.5, 0.0), N0)
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+ bool Ignored;
+ APFloat Point5Pred = APFloat(0.5f);
+ Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+ Point5Pred.next(/*nextDown*/true);
+
+ SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+ DAG.getConstantFP(Point5Pred, dl, VT), N0);
+ N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+ // Truncate the result to remove fraction.
+ return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
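
// A worked example of why the predecessor of 0.5 is needed (editorial, not
// from the source): for X = 0x1.fffffep-2f, the largest float below 0.5f,
// X + 0.5f rounds up to exactly 1.0f, so trunc would give 1.0f even though
// FROUND(X) == 0.0f. Adding nextafter(0.5f, 0.0f) keeps the sum below 1.0f.
#include <cmath>
float roundModel(float X) {
  float Point5Pred = nextafterf(0.5f, 0.0f); // largest float < 0.5f
  return truncf(X + copysignf(Point5Pred, X));
}
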
+
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -20568,9 +21360,12 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
-/// style scalarized (associative) reduction patterns.
+/// style scalarized (associative) reduction patterns. Partial reductions
+/// are supported when the pointer SrcMask is non-null.
+/// TODO - move this to SelectionDAG?
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
- SmallVectorImpl<SDValue> &SrcOps) {
+ SmallVectorImpl<SDValue> &SrcOps,
+ SmallVectorImpl<APInt> *SrcMask = nullptr) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
@@ -20598,8 +21393,8 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
return false;
// Quit if without a constant index.
- SDValue Idx = I->getOperand(1);
- if (!isa<ConstantSDNode>(Idx))
+ auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
+ if (!Idx)
return false;
SDValue Src = I->getOperand(0);
@@ -20615,61 +21410,167 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
+
// Quit if element already used.
- unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned CIdx = Idx->getZExtValue();
if (M->second[CIdx])
return false;
M->second.setBit(CIdx);
}
- // Quit if not all elements are used.
- for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
- E = SrcOpMap.end();
- I != E; ++I) {
- if (!I->second.isAllOnesValue())
- return false;
+ if (SrcMask) {
+ // Collect the source partial masks.
+ for (SDValue &SrcOp : SrcOps)
+ SrcMask->push_back(SrcOpMap[SrcOp]);
+ } else {
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
}
return true;
}
-// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+// Helper function for comparing all bits of a vector against zero.
+static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
+ const APInt &Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, X86::CondCode &X86CC) {
+ EVT VT = V.getValueType();
+ assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
+ "Element Mask vs Vector bitwidth mismatch");
+
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
+ X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
+
+ auto MaskBits = [&](SDValue Src) {
+ if (Mask.isAllOnesValue())
+ return Src;
+ EVT SrcVT = Src.getValueType();
+ SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
+ return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
+ };
+
+ // For sub-128-bit vector, cast to (legal) integer and compare with zero.
+ if (VT.getSizeInBits() < 128) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
+ return SDValue();
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getBitcast(IntVT, MaskBits(V)),
+ DAG.getConstant(0, DL, IntVT));
+ }
+
+ // Quit if not splittable to 128/256-bit vector.
+ if (!isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
+
+ // Split down to 128/256-bit vector.
+ unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
+ while (VT.getSizeInBits() > TestSize) {
+ auto Split = DAG.SplitVector(V, DL);
+ VT = Split.first.getValueType();
+ V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+ }
+
+ bool UsePTEST = Subtarget.hasSSE41();
+ if (UsePTEST) {
+ MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ V = DAG.getBitcast(TestVT, MaskBits(V));
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
+ }
+
+ // Without PTEST, a masked v2i64 or-reduction is not faster than
+ // scalarization.
+ if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
+ return SDValue();
+
+ V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
+ V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
+ DAG.getConstant(0xFFFF, DL, MVT::i32));
+}
+
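A rough intrinsics-level sketch of the two code paths this helper chooses between for a single 128-bit vector with an all-ones element mask (illustrative only; the helper names and the raw __m128i interface are assumptions here):

    #include <emmintrin.h>  // SSE2
    #include <smmintrin.h>  // SSE4.1, for _mm_testz_si128 (PTEST)

    // SSE4.1: PTEST sets ZF when (V & V) == 0, i.e. when V is all zeros.
    bool allZeroPTEST(__m128i V) { return _mm_testz_si128(V, V) != 0; }

    // SSE2 fallback: compare each byte with zero and require all 16 lanes
    // to match, mirroring CMP(MOVMSK(PCMPEQB(V, 0)), 0xFFFF).
    bool allZeroMOVMSK(__m128i V) {
      __m128i EQ = _mm_cmpeq_epi8(V, _mm_setzero_si128());
      return _mm_movemask_epi8(EQ) == 0xFFFF;
    }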
+// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
+// CMP(MOVMSK(PCMPEQB(X,0))).
+static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
- assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
- if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ if (!Subtarget.hasSSE2() || !Op->hasOneUse())
return SDValue();
- SmallVector<SDValue, 8> VecIns;
- if (!matchScalarReduction(Op, ISD::OR, VecIns))
- return SDValue();
+ // Check whether we're masking/truncating an OR-reduction result, in which
+ // case track the masked bits.
+ APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ switch (Op.getOpcode()) {
+ case ISD::TRUNCATE: {
+ SDValue Src = Op.getOperand(0);
+ Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
+ Op.getScalarValueSizeInBits());
+ Op = Src;
+ break;
+ }
+ case ISD::AND: {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Cst->getAPIntValue();
+ Op = Op.getOperand(0);
+ }
+ break;
+ }
+ }
- // Quit if not 128/256-bit vector.
- EVT VT = VecIns[0].getValueType();
- if (!VT.is128BitVector() && !VT.is256BitVector())
- return SDValue();
+ SmallVector<SDValue, 8> VecIns;
+ if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
+ EVT VT = VecIns[0].getValueType();
+ assert(llvm::all_of(VecIns,
+ [VT](SDValue V) { return VT == V.getValueType(); }) &&
+ "Reduction source vector mismatch");
+
+ // Quit if less than 128 bits or not splittable to 128/256-bit vector.
+ if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
- SDLoc DL(Op);
- MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ // If more than one full vector is evaluated, OR them first before PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
+ Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is
+ // only 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
+ }
- // Cast all vectors into TestVT for PTEST.
- for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
- VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
+ X86::CondCode CCode;
+ if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
+ DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
- // If more than one full vector is evaluated, OR them first before PTEST.
- for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
- // Each iteration will OR 2 nodes and append the result until there is only
- // 1 node left, i.e. the final OR'd value of all vectors.
- SDValue LHS = VecIns[Slot];
- SDValue RHS = VecIns[Slot + 1];
- VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ISD::NodeType BinOp;
+ if (SDValue Match =
+ DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
+ X86::CondCode CCode;
+ if (SDValue V =
+ LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
}
- X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
- DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
+ return SDValue();
}
/// return true if \c Op has a use that doesn't just read flags.
@@ -20814,27 +21715,14 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
- unsigned X86CC, const SDLoc &dl,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue Chain, bool IsSignaling) {
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (isNullConstant(Op1))
- return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
+ return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
EVT CmpVT = Op0.getValueType();
- if (CmpVT.isFloatingPoint()) {
- if (Chain) {
- SDValue Res =
- DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
- dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
- return std::make_pair(Res, Res.getValue(1));
- }
- return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
- SDValue());
- }
-
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
@@ -20884,40 +21772,28 @@ static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
+ Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
+ return Add.getValue(1);
+ }
+
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
+ Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
+ return Add.getValue(1);
+ }
+
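A scalar illustration of the identity behind these two rewrites and of why they are gated on COND_E/COND_NE (standalone snippet; names are made up here):

    #include <cstdint>

    // With two's-complement wraparound, 0 - x == y holds exactly when
    // x + y == 0, so an equality against a negation can reuse the flags of
    // an ADD and drop the negate. Ordered conditions are not preserved by
    // this rewrite, hence the COND_E / COND_NE restriction above.
    bool eqViaNeg(uint32_t X, uint32_t Y) { return (0u - X) == Y; }
    bool eqViaAdd(uint32_t X, uint32_t Y) { return X + Y == 0u; } // same value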
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return std::make_pair(Sub.getValue(1), SDValue());
-}
-
-/// Convert a comparison if required by the subtarget.
-SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
- SelectionDAG &DAG) const {
- // If the subtarget does not support the FUCOMI instruction, floating-point
- // comparisons have to be converted.
- bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
- bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
- Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
-
- if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
- !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
- !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
- return Cmp;
-
- // The instruction selector will select an FUCOM instruction instead of
- // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
- // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
- // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
- SDLoc dl(Cmp);
- SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
- SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
- SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
- DAG.getConstant(8, dl, MVT::i8));
- SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
-
- // Some 64-bit targets lack SAHF support, but they do support FCOMI.
- assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
- return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+ return Sub.getValue(1);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
@@ -21056,7 +21932,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
// Divide by pow2.
SDValue SRA =
- DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64));
+ DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
@@ -21211,32 +22087,30 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
/// concatenate the result back.
-static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
+static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
- assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
- "Unsupported value type for operation");
+ assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
+ assert(Op.getOperand(0).getValueType().isInteger() &&
+ VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
- unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+ DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
+ DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -21369,8 +22243,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ // If we have a strict compare with a vXi1 result and the input is 128/256
+ // bits we can't use a masked compare unless we have VLX. If we use a wider
+ // compare like we do for non-strict, we might trigger spurious exceptions
+ // from the upper elements. Instead emit an AVX compare and convert to mask.
unsigned Opc;
- if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
+ (!IsStrict || Subtarget.hasVLX() ||
+ Op0.getSimpleValueType().is512BitVector())) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
@@ -21466,10 +22346,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
- // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
- // result type of SETCC. The bitcast is expected to be optimized away
- // during combining/isel.
- Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
+ // We emitted a compare with an XMM/YMM result. Finish converting to a
+ // mask register using a vptestm.
+ EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
+ Cmp = DAG.getBitcast(CastVT, Cmp);
+ Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
+ DAG.getConstant(0, dl, CastVT), ISD::SETNE);
+ } else {
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match
+ // the result type of SETCC. The bitcast is expected to be optimized
+ // away during combining/isel.
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ }
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);
@@ -21563,7 +22452,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntVSETCC(Op, DAG);
+ return splitIntVSETCC(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8) {
+ assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
+ return splitIntVSETCC(Op, DAG);
+ }
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
@@ -21812,9 +22706,8 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG, SDValue &X86CC,
- SDValue &Chain,
- bool IsSignaling) const {
+ SelectionDAG &DAG,
+ SDValue &X86CC) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
@@ -21825,13 +22718,12 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
return BT;
}
- // Try to use PTEST for a tree ORs equality compared with 0.
+ // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
// TODO: We could do AND tree with all 1s as well by using the C flag.
- if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
- return PTEST;
- }
+ if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
+ if (SDValue CmpZ =
+ MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
+ return CmpZ;
// Try to lower using KORTEST or KTEST.
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
@@ -21873,17 +22765,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
- bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
- X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
- if (CondCode == X86::COND_INVALID)
- return SDValue();
+ X86::CondCode CondCode =
+ TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
+ assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
- std::pair<SDValue, SDValue> Tmp =
- EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
- SDValue EFLAGS = Tmp.first;
- if (Chain)
- Chain = Tmp.second;
- EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
@@ -21920,18 +22806,32 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
- SDValue X86CC;
- SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
- Op.getOpcode() == ISD::STRICT_FSETCCS);
- if (!EFLAGS)
- return SDValue();
+ if (Op0.getSimpleValueType().isInteger()) {
+ SDValue X86CC;
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
+ }
- SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ // Handle floating point.
+ X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
+ if (CondCode == X86::COND_INVALID)
+ return SDValue();
- if (IsStrict)
- return DAG.getMergeValues({Res, Chain}, dl);
+ SDValue EFLAGS;
+ if (IsStrict) {
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ EFLAGS =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ Chain = EFLAGS.getValue(1);
+ } else {
+ EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
+ }
- return Res;
+ SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -21946,9 +22846,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
// Recreate the carry if needed.
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getConstant(NegOne, DL, CarryVT));
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
@@ -22024,7 +22923,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
- Opc == X86ISD::SAHF)
+ Opc == X86ISD::FCMP)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
@@ -22057,9 +22956,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
- if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget.hasSSE2() && VT == MVT::f64) ||
- (Subtarget.hasSSE1() && VT == MVT::f32)) &&
+ if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
bool IsAlwaysSignaling;
@@ -22115,45 +23012,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
- // For v64i1 without 64-bit support we need to split and rejoin.
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
- assert(Subtarget.hasBWI() && "Expected BWI to be legal");
- SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
- SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
- SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
- SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
- SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
- SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
-
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
- SDValue Op1Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
- Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
- else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
- Op1Scalar = Op1.getOperand(0);
- SDValue Op2Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
- Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
- else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
- Op2Scalar = Op2.getOperand(0);
- if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
- Op1Scalar, Op2Scalar);
- if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, newSelect);
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
- DAG.getIntPtrConstant(0, DL));
- }
- }
-
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
@@ -22175,12 +23038,28 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
+ SDValue CmpOp0 = Cmp.getOperand(0);
unsigned CondCode = Cond.getConstantOperandVal(0);
- if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+ // Special handling for __builtin_ffs(X) - 1 pattern which looks like
+ // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
+ // handling to keep the CMP with 0. This should be removed by
+ // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
+ // cttz_zero_undef.
+ auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
+ return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
+ Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
+ };
+ if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
+ ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
+ (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
+ // Keep Cmp.
+ } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
- SDValue CmpOp0 = Cmp.getOperand(0);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
@@ -22188,31 +23067,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
}
- Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
- DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
+ DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
- if (!isNullConstant(Op2))
- Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
- return Res;
+ return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
- SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// true if Op2 is XOR or OR operator and one of its operands
// is equal to Op1
@@ -22265,7 +23138,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
- !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -22311,7 +23184,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
- Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -22333,7 +23205,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
- // Blacklist CopyFromReg to avoid partial register stalls.
+ // Exclude CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
@@ -22570,14 +23442,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
@@ -22620,23 +23487,19 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
if (!Store->isSimple())
return SDValue();
- EVT StoreVT = StoredVal.getValueType();
- unsigned NumElems = StoreVT.getVectorNumElements();
- unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
- unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
-
SDLoc DL(Store);
- SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
- SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
+ SDValue Value0, Value1;
+ std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
+ unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
- unsigned Alignment = Store->getAlignment();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
- Alignment, Store->getMemOperand()->getFlags());
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
- Store->getPointerInfo().getWithOffset(HalfAlign),
- MinAlign(Alignment, HalfAlign),
+ Store->getPointerInfo().getWithOffset(HalfOffset),
+ Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
@@ -22659,7 +23522,6 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
- unsigned Alignment = Store->getAlignment();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
@@ -22670,7 +23532,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
Store->getPointerInfo().getWithOffset(Offset),
- MinAlign(Alignment, Offset),
+ Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
Stores.push_back(Ch);
}
@@ -22699,7 +23561,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -22711,7 +23573,9 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
- if (StoreVT.is256BitVector()) {
+ if (StoreVT.is256BitVector() ||
+ ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
+ !Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
@@ -22738,7 +23602,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
@@ -22773,7 +23637,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
"Expected AVX512F without AVX512DQI");
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
// Replace chain users with the new chain.
@@ -22801,163 +23665,44 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Op.getOperand(1).hasOneUse());
}
-/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
-/// SETCC node has a single use.
-static bool isXor1OfSetCC(SDValue Op) {
- if (Op.getOpcode() != ISD::XOR)
- return false;
- if (isOneConstant(Op.getOperand(1)))
- return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
- Op.getOperand(0).hasOneUse();
- return false;
-}
-
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- SDValue CC;
- bool Inverted = false;
- if (Cond.getOpcode() == ISD::SETCC) {
- // Check for setcc([su]{add,sub,mul}o == 0).
- if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
- isNullConstant(Cond.getOperand(1)) &&
- Cond.getOperand(0).getResNo() == 1 &&
- (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
- Cond.getOperand(0).getOpcode() == ISD::UADDO ||
- Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
- Cond.getOperand(0).getOpcode() == ISD::USUBO ||
- Cond.getOperand(0).getOpcode() == ISD::SMULO ||
- Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
- Inverted = true;
- Cond = Cond.getOperand(0);
- } else {
- if (SDValue NewCond = LowerSETCC(Cond, DAG))
- Cond = NewCond;
- }
- }
-#if 0
- // FIXME: LowerXALUO doesn't handle these!!
- else if (Cond.getOpcode() == X86ISD::ADD ||
- Cond.getOpcode() == X86ISD::SUB ||
- Cond.getOpcode() == X86ISD::SMUL ||
- Cond.getOpcode() == X86ISD::UMUL)
- Cond = LowerXALUO(Cond, DAG);
-#endif
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0).getValueType() != MVT::f128) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- // Look pass (and (setcc_carry (cmp ...)), 1).
- if (Cond.getOpcode() == ISD::AND &&
- Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
- isOneConstant(Cond.getOperand(1)))
- Cond = Cond.getOperand(0);
+ // Special case for
+ // setcc([su]{add,sub,mul}o == 0)
+ // setcc([su]{add,sub,mul}o != 1)
+ if (ISD::isOverflowIntrOpRes(LHS) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ (isNullConstant(RHS) || isOneConstant(RHS))) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
- // If condition flag is set by a X86ISD::CMP, then use it as the condition
- // setting operand in place of the X86ISD::SETCC.
- unsigned CondOpcode = Cond.getOpcode();
- if (CondOpcode == X86ISD::SETCC ||
- CondOpcode == X86ISD::SETCC_CARRY) {
- CC = Cond.getOperand(0);
+ if ((CC == ISD::SETEQ) == isNullConstant(RHS))
+ X86Cond = X86::GetOppositeBranchCondition(X86Cond);
- SDValue Cmp = Cond.getOperand(1);
- unsigned Opc = Cmp.getOpcode();
- // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
- if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
- Cond = Cmp;
- addTest = false;
- } else {
- switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
- default: break;
- case X86::COND_O:
- case X86::COND_B:
- // These can only come from an arithmetic instruction with overflow,
- // e.g. SADDO, UADDO.
- Cond = Cond.getOperand(1);
- addTest = false;
- break;
- }
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
- }
- CondOpcode = Cond.getOpcode();
- if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
- CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
- CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
- SDValue Value;
- X86::CondCode X86Cond;
- std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- if (Inverted)
- X86Cond = X86::GetOppositeBranchCondition(X86Cond);
+ if (LHS.getSimpleValueType().isInteger()) {
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
+ }
- CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- addTest = false;
- } else {
- unsigned CondOpc;
- if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
- SDValue Cmp = Cond.getOperand(0).getOperand(1);
- if (CondOpc == ISD::OR) {
- // Also, recognize the pattern generated by an FCMP_UNE. We can emit
- // two branches instead of an explicit OR instruction with a
- // separate test.
- if (Cmp == Cond.getOperand(1).getOperand(1) &&
- isX86LogicalCmp(Cmp)) {
- CC = Cond.getOperand(0).getOperand(0);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = Cond.getOperand(1).getOperand(0);
- Cond = Cmp;
- addTest = false;
- }
- } else { // ISD::AND
- // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
- // two branches instead of an explicit AND instruction with a
- // separate test. However, we only do this if this block doesn't
- // have a fall-through edge, because this requires an explicit
- // jmp when the condition is false.
- if (Cmp == Cond.getOperand(1).getOperand(1) &&
- isX86LogicalCmp(Cmp) &&
- Op.getNode()->hasOneUse()) {
- X86::CondCode CCode0 =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode0 = X86::GetOppositeBranchCondition(CCode0);
- CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
- SDNode *User = *Op.getNode()->use_begin();
- // Look for an unconditional branch following this conditional branch.
- // We need this because we need to reverse the successors in order
- // to implement FCMP_OEQ.
- if (User->getOpcode() == ISD::BR) {
- SDValue FalseBB = User->getOperand(1);
- SDNode *NewBR =
- DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
- assert(NewBR == User);
- (void)NewBR;
- Dest = FalseBB;
-
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
- Dest, CC, Cmp);
- X86::CondCode CCode1 =
- (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
- CCode1 = X86::GetOppositeBranchCondition(CCode1);
- CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
- }
- }
- }
- } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
- // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
- // It should be transformed during dag combiner except when the condition
- // is set by a arithmetics with overflow node.
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
- Cond = Cond.getOperand(0).getOperand(1);
- addTest = false;
- } else if (Cond.getOpcode() == ISD::SETCC &&
- cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ if (CC == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
@@ -22976,59 +23721,65 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
(void)NewBR;
Dest = FalseBB;
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- Cond.getOperand(0), Cond.getOperand(1));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
+ CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
}
- } else if (Cond.getOpcode() == ISD::SETCC &&
- cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ } else if (CC == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- Cond.getOperand(0), Cond.getOperand(1));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain =
+ DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ } else {
+ X86::CondCode X86Cond =
+ TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
}
- if (addTest) {
- // Look pass the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ if (ISD::isOverflowIntrOpRes(Cond)) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- // We know the result of AND is compared against zero. Try to match
- // it to BT.
- if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- SDValue BTCC;
- if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
- CC = BTCC;
- Cond = BT;
- addTest = false;
- }
- }
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
- if (addTest) {
- X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
- CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
- }
- Cond = ConvertCmpIfNecessary(Cond, DAG);
- return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cond);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ EVT CondVT = Cond.getValueType();
+
+ // Add an AND with 1 if we don't already have one.
+ if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
+ Cond =
+ DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
+
+ SDValue LHS = Cond;
+ SDValue RHS = DAG.getConstant(0, dl, CondVT);
+
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
@@ -23041,9 +23792,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
+ bool EmitStackProbeCall = hasStackProbeSymbol(MF);
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack || EmitStackProbe;
+ SplitStack || EmitStackProbeCall;
SDLoc dl(Op);
// Get the inputs.
@@ -23067,12 +23818,22 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Align StackAlign(TFI.getStackAlignment());
- Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Alignment && Alignment > StackAlign)
+ const Align StackAlign = TFI.getStackAlign();
+ if (hasInlineStackProbe(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ }
+ if (Alignment && *Alignment > StackAlign)
Result =
DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
@@ -23203,14 +23964,13 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
- if (ArgVT == MVT::f80) {
- llvm_unreachable("va_arg for f80 not yet implemented");
- } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+ assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
+ if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
- } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
- ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
- llvm_unreachable("Unhandled argument type in LowerVAARG");
+ assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
+ "Unhandled argument type in LowerVAARG");
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
}
if (ArgMode == 2) {
@@ -23227,11 +23987,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
- X86ISD::VAARG_64, dl,
- VTs, InstOps, MVT::i64,
- MachinePointerInfo(SV),
- /*Align=*/0,
- MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
+ X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
+ /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
@@ -23255,9 +24012,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
- return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
- DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
- false, false,
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
+ Align(8), /*isVolatile*/ false, false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
@@ -23319,7 +24075,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
- Elts.push_back(CurrentOp);
+ // Must produce 0s in the correct bits.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -23331,7 +24088,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
- Elts.push_back(CurrentOp);
+ // Must produce 0s in the correct bits.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -23343,7 +24101,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
- Elts.push_back(CurrentOp);
+ // All shifted-in bits must be the same, so use 0.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -24001,8 +24760,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
+ // Some conditions require the operands to be swapped.
+ if (CC == ISD::SETLT || CC == ISD::SETLE)
+ std::swap(LHS, RHS);
+
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
- SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
@@ -24018,18 +24780,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
+ case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
- case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
- SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
- break;
}
case ISD::SETGE: // CF = 0
+ case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
- case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
- SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
- break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
@@ -24478,6 +25236,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Clamp out of bounds shift amounts since they will otherwise be masked
// to 8-bits which may make it no longer out of bounds.
unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
+ if (ShiftAmount == 0)
+ return Op.getOperand(1);
+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
@@ -24537,19 +25298,23 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ // Cast mask to an integer type.
+ Mask = DAG.getBitcast(MaskVT, Mask);
+
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
@@ -24574,7 +25339,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
@@ -24584,9 +25349,10 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -24612,11 +25378,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return Res.getValue(1);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res;
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -24775,13 +25542,11 @@ static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
-
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
- return SignedSat ?
- DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
- DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+ unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
@@ -24789,12 +25554,10 @@ static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
-
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
- return SignedSat ?
- DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
- DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+ unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
@@ -25144,7 +25907,7 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
-unsigned X86TargetLowering::getExceptionPointerRegister(
+Register X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
@@ -25152,7 +25915,7 @@ unsigned X86TargetLowering::getExceptionPointerRegister(
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
-unsigned X86TargetLowering::getExceptionSelectorRegister(
+Register X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
@@ -25176,7 +25939,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
- unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+ Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
@@ -25390,93 +26153,51 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2 Round to +inf
3 Round to -inf
- To perform the conversion, we do:
- (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+ To perform the conversion, we use a packed lookup table of the four 2-bit
+ values that we can index by FPSR[11:10]
+ 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
+
+ (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI =
- MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOStore, 2, 2);
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
- SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
- SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
- DAG.getVTList(MVT::Other),
- Ops, MVT::i16, MMO);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ops[] = {Chain, StackSlot};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+ DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+ Align(2), MachineMemOperand::MOStore);
// Load FP Control Word from stack slot
- SDValue CWD =
- DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
+ Chain = CWD.getValue(1);
- // Transform as necessary
- SDValue CWD1 =
- DAG.getNode(ISD::SRL, DL, MVT::i16,
- DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x800, DL, MVT::i16)),
- DAG.getConstant(11, DL, MVT::i8));
- SDValue CWD2 =
+ // Mask and turn the control bits into a shift for the lookup table.
+ SDValue Shift =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x400, DL, MVT::i16)),
+ CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
+ SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
SDValue RetVal =
- DAG.getNode(ISD::AND, DL, MVT::i16,
- DAG.getNode(ISD::ADD, DL, MVT::i16,
- DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
- DAG.getConstant(1, DL, MVT::i16)),
- DAG.getConstant(3, DL, MVT::i16));
-
- return DAG.getNode((VT.getSizeInBits() < 16 ?
- ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
-}
-
-// Split an unary integer op into 2 half sized ops.
-static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- unsigned NumElems = VT.getVectorNumElements();
- unsigned SizeInBits = VT.getSizeInBits();
- MVT EltVT = VT.getVectorElementType();
- SDValue Src = Op.getOperand(0);
- assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
- "Src and Op should have the same element type!");
+ DAG.getNode(ISD::AND, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+ DAG.getConstant(3, DL, MVT::i32));
- // Extract the Lo/Hi vectors
- SDLoc dl(Op);
- SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
- SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
+ RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
- DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
-}
-
-// Decompose 256-bit ops into smaller 128-bit ops.
-static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return LowerVectorIntUnary(Op, DAG);
-}
-
-// Decompose 512-bit ops into smaller 256-bit ops.
-static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is512BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 512-bit vector integer operation");
- return LowerVectorIntUnary(Op, DAG);
+ return DAG.getMergeValues({RetVal, Chain}, DL);
}
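The packed lookup above is easy to sanity-check with plain integer arithmetic. A minimal standalone sketch (hypothetical helper name, not code from this patch) evaluates (0x2d >> ((CW & 0xc00) >> 9)) & 3 for each rounding mode of the x87 control word saved by FNSTCW:

#include <cassert>
#include <cstdint>

// Maps the x87 control-word rounding field (bits 11:10) to the
// FLT_ROUNDS encoding using the 0x2d packed lookup table.
static unsigned fltRoundsFromCW(uint16_t CW) {
  return (0x2d >> ((CW & 0xc00) >> 9)) & 3;
}

int main() {
  assert(fltRoundsFromCW(0x0000) == 1); // 00 = to nearest   -> 1
  assert(fltRoundsFromCW(0x0400) == 3); // 01 = toward -inf  -> 3
  assert(fltRoundsFromCW(0x0800) == 2); // 10 = toward +inf  -> 2
  assert(fltRoundsFromCW(0x0c00) == 0); // 11 = toward zero  -> 0
  return 0;
}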
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
@@ -25499,7 +26220,7 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
// Split vector, its Lo and Hi parts will be handled in next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
- return LowerVectorIntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
@@ -25609,11 +26330,11 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
- return Lower512IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
@@ -25679,64 +26400,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
-/// Break a 256-bit integer operation into two new 128-bit ones and then
-/// concatenate the result back.
-static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- assert(VT.is256BitVector() && VT.isInteger() &&
- "Unsupported value type for operation");
-
- unsigned NumElems = VT.getVectorNumElements();
- SDLoc dl(Op);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
-
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
-}
-
-/// Break a 512-bit integer operation into two new 256-bit ones and then
-/// concatenate the result back.
-static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- assert(VT.is512BitVector() && VT.isInteger() &&
- "Unsupported value type for operation");
-
- unsigned NumElems = VT.getVectorNumElements();
- SDLoc dl(Op);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
-
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
-}
-
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -25747,10 +26410,13 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
@@ -25795,10 +26461,13 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
@@ -25828,9 +26497,12 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
}
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
// Default to expand.
return SDValue();
}
@@ -25840,7 +26512,10 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
@@ -25884,7 +26559,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
@@ -26030,7 +26708,10 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
@@ -26119,41 +26800,9 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
- // For signed 512-bit vectors, split into 256-bit vectors to allow the
- // sign-extension to occur.
- if (VT == MVT::v64i8 && IsSigned)
- return split512IntArith(Op, DAG);
-
- // Signed AVX2 implementation - extend xmm subvectors to ymm.
- if (VT == MVT::v32i8 && IsSigned) {
- MVT ExVT = MVT::v16i16;
- SDValue ALo = extract128BitVector(A, 0, DAG, dl);
- SDValue BLo = extract128BitVector(B, 0, DAG, dl);
- SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
- SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
- ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
- BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
- AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
- BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
- SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
- Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
- Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
-
- // Bitcast back to VT and then pack all the even elements from Lo and Hi.
- // Shuffle lowering should turn this into PACKUS+PERMQ
- Lo = DAG.getBitcast(VT, Lo);
- Hi = DAG.getBitcast(VT, Hi);
- return DAG.getVectorShuffle(VT, dl, Lo, Hi,
- { 0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30,
- 32, 34, 36, 38, 40, 42, 44, 46,
- 48, 50, 52, 54, 56, 58, 60, 62});
- }
-
- // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
- // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
- // shift the results and pack the half lane results back together.
+ // For vXi8 we will unpack the low and high half of each 128-bit lane to widen
+ // to a vXi16 type. Do the multiplies, shift the results and pack the half
+ // lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
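The per-byte strategy described above is the vector form of the scalar identity mulhi(a, b) = (widen(a) * widen(b)) >> 8. A small self-contained illustration of that identity for the unsigned case (hypothetical helper name, not part of the patch):

#include <cassert>
#include <cstdint>

// High half of an 8-bit multiply, computed by widening to 16 bits,
// multiplying, and shifting the product back down - the same shape the
// vector lowering uses within each 128-bit lane before repacking.
static uint8_t mulhu8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((uint16_t(A) * uint16_t(B)) >> 8);
}

int main() {
  assert(mulhu8(200, 200) == 156); // 40000 >> 8
  assert(mulhu8(255, 255) == 254); // 65025 >> 8
  assert(mulhu8(16, 15) == 0);     // 240 >> 8
  return 0;
}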
@@ -26267,9 +26916,12 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
- MachinePointerInfo(), /* Alignment = */ 16);
+ MPI, /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
@@ -26410,7 +27062,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
- VT == MVT::v64i8) {
+ (Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -26856,8 +27508,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
- (VT == MVT::v16i8 || VT == MVT::v64i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
@@ -26920,12 +27572,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
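PBLENDVB, as used by the new X86ISD::BLENDV nodes above, keys only on bit 7 of each selector byte. A scalar model of that per-byte behaviour (illustrative only, not LLVM code):

#include <cassert>
#include <cstdint>

// Per-byte PBLENDVB semantics: take the byte from V1 when the selector
// byte has its sign bit set, otherwise keep the byte from V0.
static uint8_t pblendvbByte(uint8_t Sel, uint8_t V0, uint8_t V1) {
  return (Sel & 0x80) ? V1 : V0;
}

int main() {
  assert(pblendvbByte(0x80, 1, 2) == 2); // sign bit set -> V1
  assert(pblendvbByte(0x7f, 1, 2) == 1); // nonzero but sign bit clear -> V0
  return 0;
}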
@@ -27035,14 +27688,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just on
+ // the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
@@ -27093,7 +27747,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
return SDValue();
}
@@ -27111,28 +27768,21 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
- APInt UndefElts;
- SmallVector<APInt, 32> EltBits;
- int CstSplatIndex = -1;
- if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
- for (int i = 0; i != NumElts; ++i)
- if (!UndefElts[i]) {
- if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
- CstSplatIndex = i;
- continue;
- }
- CstSplatIndex = -1;
- break;
- }
+ APInt CstSplatValue;
+ bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
+
+ // Check for splat rotate by zero.
+ if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
+ return R;
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
- if (0 <= CstSplatIndex) {
- unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
- uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
- return DAG.getNode(Op, DL, VT, R,
- DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ if (IsCstSplat) {
+ unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+ return DAG.getNode(RotOpc, DL, VT, R,
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
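The urem(EltSizeInBits) above relies on rotates being periodic in the element width, so e.g. a vXi32 rotate by a splat of 37 is the same as a rotate by 5. A scalar check of that equivalence (assumed helper, plain C++ rather than DAG code):

#include <cassert>
#include <cstdint>

// Rotate-left on a 32-bit lane; the amount is reduced mod 32 so the
// function is total and matches the hardware's implicit modulo.
static uint32_t rotl32(uint32_t X, unsigned N) {
  N &= 31;
  return (X << N) | (X >> ((32 - N) & 31));
}

int main() {
  uint32_t X = 0x12345678u;
  assert(rotl32(X, 37) == rotl32(X, 37 % 32)); // rotate by 37 == rotate by 5
  assert(rotl32(X, 32) == X);                  // full-width rotate is a no-op
  return 0;
}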
@@ -27146,14 +27796,14 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
- if (0 <= CstSplatIndex) {
- uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
+ if (IsCstSplat) {
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
@@ -27162,7 +27812,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
@@ -27170,7 +27820,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts.
- if (0 <= CstSplatIndex)
+ if (IsCstSplat)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
@@ -27186,12 +27836,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
@@ -27303,15 +27954,14 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
return false;
}
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
-// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
return false;
return needsCmpXchgNb(MemType);
@@ -27330,7 +27980,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE2() || Subtarget.hasX87()))
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
@@ -27396,7 +28046,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
AI->use_empty())
return nullptr;
- auto Builder = IRBuilder<>(AI);
+ IRBuilder<> Builder(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
@@ -27438,7 +28088,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// Finally we can emit the atomic load.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
- AI->getType()->getPrimitiveSizeInBits());
+ Align(AI->getType()->getPrimitiveSizeInBits()));
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -27633,18 +28283,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
- SDLoc dl(Op);
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- MVT CastVT = DstVT.getHalfNumVectorElementsVT();
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- }
-
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
@@ -27828,11 +28466,11 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
- return Lower512IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
@@ -27876,7 +28514,7 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
@@ -27913,12 +28551,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
- // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
- // lowering.
- if (VT == MVT::v8i64 || VT == MVT::v16i32) {
- assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
- return Lower512IntUnary(Op, DAG);
- }
+ // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+ if (VT == MVT::v64i8 && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
@@ -27926,7 +28561,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
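The nibble-lookup idea described above has a direct scalar analogue: reverse the low and high nibbles through a 16-entry table and swap them into place. A minimal standalone sketch (hypothetical helper name, not part of the patch):

#include <cassert>
#include <cstdint>

// Scalar model of the PSHUFB-based byte reverse: look up the bit-reversed
// low and high nibbles separately, then exchange their positions.
static uint8_t bitreverse8(uint8_t B) {
  static const uint8_t RevNibble[16] = {0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
                                        0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf};
  return uint8_t((RevNibble[B & 0xf] << 4) | RevNibble[B >> 4]);
}

int main() {
  assert(bitreverse8(0x01) == 0x80);
  assert(bitreverse8(0xb4) == 0x2d); // 1011'0100 -> 0010'1101
  return 0;
}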
@@ -28070,28 +28705,54 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
- // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
- // FIXME: Use movlps with SSE1.
- // FIXME: Use fist with X87.
+ // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+ // is enabled.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- Subtarget.hasSSE2()) {
- SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
- Node->getOperand(2));
- SDVTList Tys = DAG.getVTList(MVT::Other);
- SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
- SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
- Ops, MVT::i64,
- Node->getMemOperand());
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ SDValue Chain;
+ if (Subtarget.hasSSE1()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SclToVec = DAG.getBitcast(StVT, SclToVec);
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ } else if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register using a stack temporary.
+ // This will put the whole integer into the significand.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Chain =
+ DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
+ MPI, /*Align*/ 0, MachineMemOperand::MOStore);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue LdOps[] = {Chain, StackPtr};
+ SDValue Value =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Value.getValue(1);
+
+ // Now use an FIST to do the atomic store.
+ SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
+ StoreOps, MVT::i64, Node->getMemOperand());
+ }
- // If this is a sequentially consistent store, also emit an appropriate
- // barrier.
- if (IsSeqCst)
- Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ if (Chain) {
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
- return Chain;
+ return Chain;
+ }
}
}
@@ -28120,9 +28781,8 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Set the carry flag.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getConstant(NegOne, DL, CarryVT));
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
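The getAllOnesConstant add above recreates the carry flag: adding all-ones (-1) to the incoming carry value produces a carry-out exactly when that value is nonzero. A quick scalar check of the property (plain C++, not DAG code):

#include <cassert>
#include <cstdint>

// Adding 0xFF (all-ones in 8 bits) to C overflows 8 bits - i.e. would set
// the hardware carry flag - precisely when C is nonzero.
static bool carryOutOfAllOnesAdd(uint8_t C) {
  return unsigned(C) + 0xFFu > 0xFFu;
}

int main() {
  assert(!carryOutOfAllOnesAdd(0)); // a zero carry-in stays zero
  for (unsigned C = 1; C <= 0xFF; ++C)
    assert(carryOutOfAllOnesAdd(uint8_t(C))); // any nonzero value sets CF
  return 0;
}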
@@ -28167,7 +28827,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)VectorType::get(ArgTy, 4);
+ : (Type *)FixedVectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -28264,17 +28924,15 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
- SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
}
return SDValue();
}
MVT IndexVT = Index.getSimpleValueType();
- MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
@@ -28292,18 +28950,17 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
- MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
@@ -28329,8 +28986,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
N->isExpandingLoad());
// Emit a blend.
- SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
- PassThru);
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
}
@@ -28366,10 +29022,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
- SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewLoad.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
@@ -28427,7 +29083,6 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
- MVT MaskVT = Mask.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
@@ -28448,7 +29103,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
- MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
@@ -28457,12 +29112,12 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
+ SDValue NewGather = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
- return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
+ return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
}
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
@@ -28528,6 +29183,20 @@ SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
return Tmp.first;
}
+// Custom split CVTPS2PH with wide types.
+static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ SDValue RC = Op.getOperand(1);
+ Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
+ Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -28581,14 +29250,21 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::FP16_TO_FP:
+ case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
+ case ISD::FP_TO_FP16:
+ case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+ case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
+ case ISD::LRINT:
+ case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
@@ -28656,8 +29332,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
- case ISD::ADDRSPACECAST:
- return LowerADDRSPACECAST(Op, DAG);
+ case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
+ case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
}
}
@@ -28703,6 +29379,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
+ Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+ case X86ISD::STRICT_CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
+ {N->getOperand(0), Lo});
+ Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
+ {N->getOperand(0), Hi});
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
@@ -28772,7 +29477,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ABS: {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
@@ -28785,15 +29489,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
- DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
- TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
- Results.push_back(Lo);
- Results.push_back(Hi);
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
return;
}
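The split i64 ABS above uses the classic sign-mask identity abs(x) = (x + sign) ^ sign, carried across the two 32-bit halves with UADDO/ADDCARRY. A standalone model of the same computation (hypothetical helper name, illustrative only):

#include <cassert>
#include <cstdint>

// Model of the split i64 ABS: derive an all-ones/all-zeros mask from the
// sign of the high half, add it with carry across both halves, then XOR
// both halves with the mask.
static uint64_t abs64ViaHalves(int64_t X) {
  uint32_t Lo = uint32_t(X), Hi = uint32_t(uint64_t(X) >> 32);
  uint32_t Mask = uint32_t(int32_t(Hi) >> 31); // all-ones iff X < 0
  uint64_t LoSum = uint64_t(Lo) + Mask;
  uint32_t Carry = uint32_t(LoSum >> 32);
  Lo = uint32_t(LoSum) ^ Mask;
  Hi = (Hi + Mask + Carry) ^ Mask;
  return (uint64_t(Hi) << 32) | Lo;
}

int main() {
  assert(abs64ViaHalves(-5) == 5);
  assert(abs64ViaHalves(123456789012345LL) == 123456789012345ULL);
  assert(abs64ViaHalves(-4294967296LL) == 4294967296ULL); // crosses the half boundary
  return 0;
}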
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
@@ -29145,6 +29847,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::LRINT:
+ case ISD::LLRINT: {
+ if (SDValue V = LRINT_LLRINTHelper(N, DAG))
+ Results.push_back(V);
+ return;
+ }
+
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
@@ -29182,14 +29891,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
for (int i = 0; i != 2; ++i) {
- SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
SignSrc, DAG.getIntPtrConstant(i, dl));
if (IsStrict)
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
- {N->getOperand(0), Src});
+ {N->getOperand(0), Elt});
else
- SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
};
SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
SDValue Slow, Chain;
@@ -29269,7 +29978,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(V.getValue(1));
return;
}
- case ISD::FP_EXTEND: {
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
@@ -29391,15 +30101,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
- if (Subtarget.hasSSE2()) {
- // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
- // lower 64-bits.
- SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ if (Subtarget.hasSSE1()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
+ // Then extract the lower 64-bits.
+ MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ if (Subtarget.hasSSE2()) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ // We use an alternative sequence for SSE1 that extracts as v2f32 and
+ // then casts to i64. This avoids a 128-bit stack temporary being
+ // created by type legalization if we were to cast v4f32->v2i64.
+ SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getBitcast(MVT::i64, Res);
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
@@ -29407,14 +30129,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
- // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
- SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
- SDValue InFlag = Result.getValue(2);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
@@ -29424,11 +30144,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
- SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
- Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
- DAG.getVTList(MVT::Other), StoreOps,
- MVT::i64, MPI, 0 /*Align*/,
- MachineMemOperand::MOStore);
+ SDValue StoreOps[] = { Chain, Result, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(
+ X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
+ MPI, None /*Align*/, MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
@@ -29477,24 +30196,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
- SrcVT.isVector() && isTypeLegal(SrcVT)) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- Results.push_back(Res);
- return;
- }
-
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+ // FIXME: Use v4f32 for SSE1?
+ assert(Subtarget.hasSSE2() && "Requires SSE2");
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
- SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+ SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
+ N->getOperand(0));
+ Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
return;
}
@@ -29526,11 +30236,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
- Gather->getMemoryVT(), Gather->getMemOperand());
+ SDValue Res = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
+ Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
- Results.push_back(Res.getValue(2));
+ Results.push_back(Res.getValue(1));
return;
}
return;
@@ -29549,7 +30259,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT VecVT = MVT::getVectorVT(LdVT, 2);
@@ -29570,25 +30280,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ADDRSPACECAST: {
- SDValue Src = N->getOperand(0);
- EVT DstVT = N->getValueType(0);
- AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
- unsigned SrcAS = CastN->getSrcAddressSpace();
-
- assert(SrcAS != CastN->getDestAddressSpace() &&
- "addrspacecast must be between different address spaces");
-
- SDValue Res;
- if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
- Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
- else if (DstVT == MVT::i64)
- Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
- else if (DstVT == MVT::i32)
- Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
- else
- report_fatal_error("Unrecognized addrspacecast type legalization");
-
- Results.push_back(Res);
+ SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+ Results.push_back(V);
return;
}
}
@@ -29597,362 +30290,367 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
- case X86ISD::BSF: return "X86ISD::BSF";
- case X86ISD::BSR: return "X86ISD::BSR";
- case X86ISD::SHLD: return "X86ISD::SHLD";
- case X86ISD::SHRD: return "X86ISD::SHRD";
- case X86ISD::FAND: return "X86ISD::FAND";
- case X86ISD::FANDN: return "X86ISD::FANDN";
- case X86ISD::FOR: return "X86ISD::FOR";
- case X86ISD::FXOR: return "X86ISD::FXOR";
- case X86ISD::FILD: return "X86ISD::FILD";
- case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
- case X86ISD::FIST: return "X86ISD::FIST";
- case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
- case X86ISD::FLD: return "X86ISD::FLD";
- case X86ISD::FST: return "X86ISD::FST";
- case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::BT: return "X86ISD::BT";
- case X86ISD::CMP: return "X86ISD::CMP";
- case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
- case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
- case X86ISD::COMI: return "X86ISD::COMI";
- case X86ISD::UCOMI: return "X86ISD::UCOMI";
- case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
- case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
- case X86ISD::SETCC: return "X86ISD::SETCC";
- case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
- case X86ISD::FSETCC: return "X86ISD::FSETCC";
- case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
- case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
- case X86ISD::CMOV: return "X86ISD::CMOV";
- case X86ISD::BRCOND: return "X86ISD::BRCOND";
- case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
- case X86ISD::IRET: return "X86ISD::IRET";
- case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
- case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
- case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
- case X86ISD::Wrapper: return "X86ISD::Wrapper";
- case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
- case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
- case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
- case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
- case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
- case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
- case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
- case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
- case X86ISD::PINSRB: return "X86ISD::PINSRB";
- case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
- case X86ISD::ANDNP: return "X86ISD::ANDNP";
- case X86ISD::BLENDI: return "X86ISD::BLENDI";
- case X86ISD::BLENDV: return "X86ISD::BLENDV";
- case X86ISD::HADD: return "X86ISD::HADD";
- case X86ISD::HSUB: return "X86ISD::HSUB";
- case X86ISD::FHADD: return "X86ISD::FHADD";
- case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
- case X86ISD::FMAX: return "X86ISD::FMAX";
- case X86ISD::FMAXS: return "X86ISD::FMAXS";
- case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
- case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
- case X86ISD::FMIN: return "X86ISD::FMIN";
- case X86ISD::FMINS: return "X86ISD::FMINS";
- case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
- case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
- case X86ISD::FMAXC: return "X86ISD::FMAXC";
- case X86ISD::FMINC: return "X86ISD::FMINC";
- case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
- case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
- case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
- case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
- case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
- case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
- case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
- case X86ISD::EH_SJLJ_SETUP_DISPATCH:
- return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
- case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
- case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
- case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
- case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
- case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
- case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
- case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
- case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
- return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
- case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
- return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
- case X86ISD::LADD: return "X86ISD::LADD";
- case X86ISD::LSUB: return "X86ISD::LSUB";
- case X86ISD::LOR: return "X86ISD::LOR";
- case X86ISD::LXOR: return "X86ISD::LXOR";
- case X86ISD::LAND: return "X86ISD::LAND";
- case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
- case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
- case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
- case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
- case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
- case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
- case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
- case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
- case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
- case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
- case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
- case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
- case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
- case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
- case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
- case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
- case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
- case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
- case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
- case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
- case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
- case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
- case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
- case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
- case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
- case X86ISD::VSHL: return "X86ISD::VSHL";
- case X86ISD::VSRL: return "X86ISD::VSRL";
- case X86ISD::VSRA: return "X86ISD::VSRA";
- case X86ISD::VSHLI: return "X86ISD::VSHLI";
- case X86ISD::VSRLI: return "X86ISD::VSRLI";
- case X86ISD::VSRAI: return "X86ISD::VSRAI";
- case X86ISD::VSHLV: return "X86ISD::VSHLV";
- case X86ISD::VSRLV: return "X86ISD::VSRLV";
- case X86ISD::VSRAV: return "X86ISD::VSRAV";
- case X86ISD::VROTLI: return "X86ISD::VROTLI";
- case X86ISD::VROTRI: return "X86ISD::VROTRI";
- case X86ISD::VPPERM: return "X86ISD::VPPERM";
- case X86ISD::CMPP: return "X86ISD::CMPP";
- case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
- case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
- case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
- case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
- case X86ISD::ADD: return "X86ISD::ADD";
- case X86ISD::SUB: return "X86ISD::SUB";
- case X86ISD::ADC: return "X86ISD::ADC";
- case X86ISD::SBB: return "X86ISD::SBB";
- case X86ISD::SMUL: return "X86ISD::SMUL";
- case X86ISD::UMUL: return "X86ISD::UMUL";
- case X86ISD::OR: return "X86ISD::OR";
- case X86ISD::XOR: return "X86ISD::XOR";
- case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BEXTR: return "X86ISD::BEXTR";
- case X86ISD::BZHI: return "X86ISD::BZHI";
- case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
- case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
- case X86ISD::PTEST: return "X86ISD::PTEST";
- case X86ISD::TESTP: return "X86ISD::TESTP";
- case X86ISD::KORTEST: return "X86ISD::KORTEST";
- case X86ISD::KTEST: return "X86ISD::KTEST";
- case X86ISD::KADD: return "X86ISD::KADD";
- case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
- case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
- case X86ISD::PACKSS: return "X86ISD::PACKSS";
- case X86ISD::PACKUS: return "X86ISD::PACKUS";
- case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
- case X86ISD::VALIGN: return "X86ISD::VALIGN";
- case X86ISD::VSHLD: return "X86ISD::VSHLD";
- case X86ISD::VSHRD: return "X86ISD::VSHRD";
- case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
- case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
- case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
- case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
- case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
- case X86ISD::SHUFP: return "X86ISD::SHUFP";
- case X86ISD::SHUF128: return "X86ISD::SHUF128";
- case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
- case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
- case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
- case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
- case X86ISD::MOVSD: return "X86ISD::MOVSD";
- case X86ISD::MOVSS: return "X86ISD::MOVSS";
- case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
- case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
- case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
- case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
- case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
- case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
- case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
- case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
- case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
- case X86ISD::VPERMV: return "X86ISD::VPERMV";
- case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
- case X86ISD::VPERMI: return "X86ISD::VPERMI";
- case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
- case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
- case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
- case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
- case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
- case X86ISD::VRANGE: return "X86ISD::VRANGE";
- case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
- case X86ISD::VRANGES: return "X86ISD::VRANGES";
- case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
- case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
- case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
- case X86ISD::PSADBW: return "X86ISD::PSADBW";
- case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
- case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
- case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
- case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
- case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
- case X86ISD::MFENCE: return "X86ISD::MFENCE";
- case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
- case X86ISD::SAHF: return "X86ISD::SAHF";
- case X86ISD::RDRAND: return "X86ISD::RDRAND";
- case X86ISD::RDSEED: return "X86ISD::RDSEED";
- case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
- case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
- case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
- case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
- case X86ISD::VPSHA: return "X86ISD::VPSHA";
- case X86ISD::VPSHL: return "X86ISD::VPSHL";
- case X86ISD::VPCOM: return "X86ISD::VPCOM";
- case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
- case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
- case X86ISD::FMSUB: return "X86ISD::FMSUB";
- case X86ISD::FNMADD: return "X86ISD::FNMADD";
- case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
- case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
- case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
- case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
- case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
- case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
- case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
- case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
- case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
- case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
- case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
- case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
- case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
- case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
- case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
- case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
- case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
- case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
- case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
- case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
- case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
- case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
- case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
- case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
- case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
- case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
- case X86ISD::XTEST: return "X86ISD::XTEST";
- case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
- case X86ISD::EXPAND: return "X86ISD::EXPAND";
- case X86ISD::SELECTS: return "X86ISD::SELECTS";
- case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
- case X86ISD::RCP14: return "X86ISD::RCP14";
- case X86ISD::RCP14S: return "X86ISD::RCP14S";
- case X86ISD::RCP28: return "X86ISD::RCP28";
- case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
- case X86ISD::RCP28S: return "X86ISD::RCP28S";
- case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
- case X86ISD::EXP2: return "X86ISD::EXP2";
- case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
- case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
- case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
- case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
- case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
- case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
- case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
- case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
- case X86ISD::FADDS: return "X86ISD::FADDS";
- case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
- case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
- case X86ISD::FSUBS: return "X86ISD::FSUBS";
- case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
- case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
- case X86ISD::FMULS: return "X86ISD::FMULS";
- case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
- case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
- case X86ISD::FDIVS: return "X86ISD::FDIVS";
- case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
- case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
- case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
- case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
- case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
- case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
- case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
- case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
- case X86ISD::SCALEF: return "X86ISD::SCALEF";
- case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
- case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
- case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
- case X86ISD::AVG: return "X86ISD::AVG";
- case X86ISD::MULHRS: return "X86ISD::MULHRS";
- case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
- case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
- case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
- case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
- case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
- case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
- case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
- case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
- case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
- case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
- case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
- case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
- case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
- case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
- case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
- case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
- case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
- case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
- case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
- case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
- case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
- case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
- case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
- case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
- case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
- case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
- case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
- case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
- case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
- case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
- case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
- case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
- case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
- case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
- case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
- case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
- case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
- case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
- case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
- case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
- case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
- case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
- case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
- case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
- case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
- case X86ISD::LWPINS: return "X86ISD::LWPINS";
- case X86ISD::MGATHER: return "X86ISD::MGATHER";
- case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
- case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
- case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
- case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
- case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
- case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
- case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
- case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
- case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
- case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
- case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
- case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
- case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
- case X86ISD::ENQCMD: return "X86ISD:ENQCMD";
- case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS";
- case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
+#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
+ NODE_NAME_CASE(BSF)
+ NODE_NAME_CASE(BSR)
+ NODE_NAME_CASE(FSHL)
+ NODE_NAME_CASE(FSHR)
+ NODE_NAME_CASE(FAND)
+ NODE_NAME_CASE(FANDN)
+ NODE_NAME_CASE(FOR)
+ NODE_NAME_CASE(FXOR)
+ NODE_NAME_CASE(FILD)
+ NODE_NAME_CASE(FIST)
+ NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+ NODE_NAME_CASE(FLD)
+ NODE_NAME_CASE(FST)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(BT)
+ NODE_NAME_CASE(CMP)
+ NODE_NAME_CASE(FCMP)
+ NODE_NAME_CASE(STRICT_FCMP)
+ NODE_NAME_CASE(STRICT_FCMPS)
+ NODE_NAME_CASE(COMI)
+ NODE_NAME_CASE(UCOMI)
+ NODE_NAME_CASE(CMPM)
+ NODE_NAME_CASE(STRICT_CMPM)
+ NODE_NAME_CASE(CMPM_SAE)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETCC_CARRY)
+ NODE_NAME_CASE(FSETCC)
+ NODE_NAME_CASE(FSETCCM)
+ NODE_NAME_CASE(FSETCCM_SAE)
+ NODE_NAME_CASE(CMOV)
+ NODE_NAME_CASE(BRCOND)
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(IRET)
+ NODE_NAME_CASE(REP_STOS)
+ NODE_NAME_CASE(REP_MOVS)
+ NODE_NAME_CASE(GlobalBaseReg)
+ NODE_NAME_CASE(Wrapper)
+ NODE_NAME_CASE(WrapperRIP)
+ NODE_NAME_CASE(MOVQ2DQ)
+ NODE_NAME_CASE(MOVDQ2Q)
+ NODE_NAME_CASE(MMX_MOVD2W)
+ NODE_NAME_CASE(MMX_MOVW2D)
+ NODE_NAME_CASE(PEXTRB)
+ NODE_NAME_CASE(PEXTRW)
+ NODE_NAME_CASE(INSERTPS)
+ NODE_NAME_CASE(PINSRB)
+ NODE_NAME_CASE(PINSRW)
+ NODE_NAME_CASE(PSHUFB)
+ NODE_NAME_CASE(ANDNP)
+ NODE_NAME_CASE(BLENDI)
+ NODE_NAME_CASE(BLENDV)
+ NODE_NAME_CASE(HADD)
+ NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(FHADD)
+ NODE_NAME_CASE(FHSUB)
+ NODE_NAME_CASE(CONFLICT)
+ NODE_NAME_CASE(FMAX)
+ NODE_NAME_CASE(FMAXS)
+ NODE_NAME_CASE(FMAX_SAE)
+ NODE_NAME_CASE(FMAXS_SAE)
+ NODE_NAME_CASE(FMIN)
+ NODE_NAME_CASE(FMINS)
+ NODE_NAME_CASE(FMIN_SAE)
+ NODE_NAME_CASE(FMINS_SAE)
+ NODE_NAME_CASE(FMAXC)
+ NODE_NAME_CASE(FMINC)
+ NODE_NAME_CASE(FRSQRT)
+ NODE_NAME_CASE(FRCP)
+ NODE_NAME_CASE(EXTRQI)
+ NODE_NAME_CASE(INSERTQI)
+ NODE_NAME_CASE(TLSADDR)
+ NODE_NAME_CASE(TLSBASEADDR)
+ NODE_NAME_CASE(TLSCALL)
+ NODE_NAME_CASE(EH_SJLJ_SETJMP)
+ NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+ NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+ NODE_NAME_CASE(EH_RETURN)
+ NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(FNSTCW16m)
+ NODE_NAME_CASE(LCMPXCHG_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+ NODE_NAME_CASE(LADD)
+ NODE_NAME_CASE(LSUB)
+ NODE_NAME_CASE(LOR)
+ NODE_NAME_CASE(LXOR)
+ NODE_NAME_CASE(LAND)
+ NODE_NAME_CASE(VZEXT_MOVL)
+ NODE_NAME_CASE(VZEXT_LOAD)
+ NODE_NAME_CASE(VEXTRACT_STORE)
+ NODE_NAME_CASE(VTRUNC)
+ NODE_NAME_CASE(VTRUNCS)
+ NODE_NAME_CASE(VTRUNCUS)
+ NODE_NAME_CASE(VMTRUNC)
+ NODE_NAME_CASE(VMTRUNCS)
+ NODE_NAME_CASE(VMTRUNCUS)
+ NODE_NAME_CASE(VTRUNCSTORES)
+ NODE_NAME_CASE(VTRUNCSTOREUS)
+ NODE_NAME_CASE(VMTRUNCSTORES)
+ NODE_NAME_CASE(VMTRUNCSTOREUS)
+ NODE_NAME_CASE(VFPEXT)
+ NODE_NAME_CASE(STRICT_VFPEXT)
+ NODE_NAME_CASE(VFPEXT_SAE)
+ NODE_NAME_CASE(VFPEXTS)
+ NODE_NAME_CASE(VFPEXTS_SAE)
+ NODE_NAME_CASE(VFPROUND)
+ NODE_NAME_CASE(STRICT_VFPROUND)
+ NODE_NAME_CASE(VMFPROUND)
+ NODE_NAME_CASE(VFPROUND_RND)
+ NODE_NAME_CASE(VFPROUNDS)
+ NODE_NAME_CASE(VFPROUNDS_RND)
+ NODE_NAME_CASE(VSHLDQ)
+ NODE_NAME_CASE(VSRLDQ)
+ NODE_NAME_CASE(VSHL)
+ NODE_NAME_CASE(VSRL)
+ NODE_NAME_CASE(VSRA)
+ NODE_NAME_CASE(VSHLI)
+ NODE_NAME_CASE(VSRLI)
+ NODE_NAME_CASE(VSRAI)
+ NODE_NAME_CASE(VSHLV)
+ NODE_NAME_CASE(VSRLV)
+ NODE_NAME_CASE(VSRAV)
+ NODE_NAME_CASE(VROTLI)
+ NODE_NAME_CASE(VROTRI)
+ NODE_NAME_CASE(VPPERM)
+ NODE_NAME_CASE(CMPP)
+ NODE_NAME_CASE(STRICT_CMPP)
+ NODE_NAME_CASE(PCMPEQ)
+ NODE_NAME_CASE(PCMPGT)
+ NODE_NAME_CASE(PHMINPOS)
+ NODE_NAME_CASE(ADD)
+ NODE_NAME_CASE(SUB)
+ NODE_NAME_CASE(ADC)
+ NODE_NAME_CASE(SBB)
+ NODE_NAME_CASE(SMUL)
+ NODE_NAME_CASE(UMUL)
+ NODE_NAME_CASE(OR)
+ NODE_NAME_CASE(XOR)
+ NODE_NAME_CASE(AND)
+ NODE_NAME_CASE(BEXTR)
+ NODE_NAME_CASE(BZHI)
+ NODE_NAME_CASE(PDEP)
+ NODE_NAME_CASE(PEXT)
+ NODE_NAME_CASE(MUL_IMM)
+ NODE_NAME_CASE(MOVMSK)
+ NODE_NAME_CASE(PTEST)
+ NODE_NAME_CASE(TESTP)
+ NODE_NAME_CASE(KORTEST)
+ NODE_NAME_CASE(KTEST)
+ NODE_NAME_CASE(KADD)
+ NODE_NAME_CASE(KSHIFTL)
+ NODE_NAME_CASE(KSHIFTR)
+ NODE_NAME_CASE(PACKSS)
+ NODE_NAME_CASE(PACKUS)
+ NODE_NAME_CASE(PALIGNR)
+ NODE_NAME_CASE(VALIGN)
+ NODE_NAME_CASE(VSHLD)
+ NODE_NAME_CASE(VSHRD)
+ NODE_NAME_CASE(VSHLDV)
+ NODE_NAME_CASE(VSHRDV)
+ NODE_NAME_CASE(PSHUFD)
+ NODE_NAME_CASE(PSHUFHW)
+ NODE_NAME_CASE(PSHUFLW)
+ NODE_NAME_CASE(SHUFP)
+ NODE_NAME_CASE(SHUF128)
+ NODE_NAME_CASE(MOVLHPS)
+ NODE_NAME_CASE(MOVHLPS)
+ NODE_NAME_CASE(MOVDDUP)
+ NODE_NAME_CASE(MOVSHDUP)
+ NODE_NAME_CASE(MOVSLDUP)
+ NODE_NAME_CASE(MOVSD)
+ NODE_NAME_CASE(MOVSS)
+ NODE_NAME_CASE(UNPCKL)
+ NODE_NAME_CASE(UNPCKH)
+ NODE_NAME_CASE(VBROADCAST)
+ NODE_NAME_CASE(VBROADCAST_LOAD)
+ NODE_NAME_CASE(VBROADCASTM)
+ NODE_NAME_CASE(SUBV_BROADCAST)
+ NODE_NAME_CASE(VPERMILPV)
+ NODE_NAME_CASE(VPERMILPI)
+ NODE_NAME_CASE(VPERM2X128)
+ NODE_NAME_CASE(VPERMV)
+ NODE_NAME_CASE(VPERMV3)
+ NODE_NAME_CASE(VPERMI)
+ NODE_NAME_CASE(VPTERNLOG)
+ NODE_NAME_CASE(VFIXUPIMM)
+ NODE_NAME_CASE(VFIXUPIMM_SAE)
+ NODE_NAME_CASE(VFIXUPIMMS)
+ NODE_NAME_CASE(VFIXUPIMMS_SAE)
+ NODE_NAME_CASE(VRANGE)
+ NODE_NAME_CASE(VRANGE_SAE)
+ NODE_NAME_CASE(VRANGES)
+ NODE_NAME_CASE(VRANGES_SAE)
+ NODE_NAME_CASE(PMULUDQ)
+ NODE_NAME_CASE(PMULDQ)
+ NODE_NAME_CASE(PSADBW)
+ NODE_NAME_CASE(DBPSADBW)
+ NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+ NODE_NAME_CASE(VAARG_64)
+ NODE_NAME_CASE(WIN_ALLOCA)
+ NODE_NAME_CASE(MEMBARRIER)
+ NODE_NAME_CASE(MFENCE)
+ NODE_NAME_CASE(SEG_ALLOCA)
+ NODE_NAME_CASE(PROBED_ALLOCA)
+ NODE_NAME_CASE(RDRAND)
+ NODE_NAME_CASE(RDSEED)
+ NODE_NAME_CASE(RDPKRU)
+ NODE_NAME_CASE(WRPKRU)
+ NODE_NAME_CASE(VPMADDUBSW)
+ NODE_NAME_CASE(VPMADDWD)
+ NODE_NAME_CASE(VPSHA)
+ NODE_NAME_CASE(VPSHL)
+ NODE_NAME_CASE(VPCOM)
+ NODE_NAME_CASE(VPCOMU)
+ NODE_NAME_CASE(VPERMIL2)
+ NODE_NAME_CASE(FMSUB)
+ NODE_NAME_CASE(STRICT_FMSUB)
+ NODE_NAME_CASE(FNMADD)
+ NODE_NAME_CASE(STRICT_FNMADD)
+ NODE_NAME_CASE(FNMSUB)
+ NODE_NAME_CASE(STRICT_FNMSUB)
+ NODE_NAME_CASE(FMADDSUB)
+ NODE_NAME_CASE(FMSUBADD)
+ NODE_NAME_CASE(FMADD_RND)
+ NODE_NAME_CASE(FNMADD_RND)
+ NODE_NAME_CASE(FMSUB_RND)
+ NODE_NAME_CASE(FNMSUB_RND)
+ NODE_NAME_CASE(FMADDSUB_RND)
+ NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VPMADD52H)
+ NODE_NAME_CASE(VPMADD52L)
+ NODE_NAME_CASE(VRNDSCALE)
+ NODE_NAME_CASE(STRICT_VRNDSCALE)
+ NODE_NAME_CASE(VRNDSCALE_SAE)
+ NODE_NAME_CASE(VRNDSCALES)
+ NODE_NAME_CASE(VRNDSCALES_SAE)
+ NODE_NAME_CASE(VREDUCE)
+ NODE_NAME_CASE(VREDUCE_SAE)
+ NODE_NAME_CASE(VREDUCES)
+ NODE_NAME_CASE(VREDUCES_SAE)
+ NODE_NAME_CASE(VGETMANT)
+ NODE_NAME_CASE(VGETMANT_SAE)
+ NODE_NAME_CASE(VGETMANTS)
+ NODE_NAME_CASE(VGETMANTS_SAE)
+ NODE_NAME_CASE(PCMPESTR)
+ NODE_NAME_CASE(PCMPISTR)
+ NODE_NAME_CASE(XTEST)
+ NODE_NAME_CASE(COMPRESS)
+ NODE_NAME_CASE(EXPAND)
+ NODE_NAME_CASE(SELECTS)
+ NODE_NAME_CASE(ADDSUB)
+ NODE_NAME_CASE(RCP14)
+ NODE_NAME_CASE(RCP14S)
+ NODE_NAME_CASE(RCP28)
+ NODE_NAME_CASE(RCP28_SAE)
+ NODE_NAME_CASE(RCP28S)
+ NODE_NAME_CASE(RCP28S_SAE)
+ NODE_NAME_CASE(EXP2)
+ NODE_NAME_CASE(EXP2_SAE)
+ NODE_NAME_CASE(RSQRT14)
+ NODE_NAME_CASE(RSQRT14S)
+ NODE_NAME_CASE(RSQRT28)
+ NODE_NAME_CASE(RSQRT28_SAE)
+ NODE_NAME_CASE(RSQRT28S)
+ NODE_NAME_CASE(RSQRT28S_SAE)
+ NODE_NAME_CASE(FADD_RND)
+ NODE_NAME_CASE(FADDS)
+ NODE_NAME_CASE(FADDS_RND)
+ NODE_NAME_CASE(FSUB_RND)
+ NODE_NAME_CASE(FSUBS)
+ NODE_NAME_CASE(FSUBS_RND)
+ NODE_NAME_CASE(FMUL_RND)
+ NODE_NAME_CASE(FMULS)
+ NODE_NAME_CASE(FMULS_RND)
+ NODE_NAME_CASE(FDIV_RND)
+ NODE_NAME_CASE(FDIVS)
+ NODE_NAME_CASE(FDIVS_RND)
+ NODE_NAME_CASE(FSQRT_RND)
+ NODE_NAME_CASE(FSQRTS)
+ NODE_NAME_CASE(FSQRTS_RND)
+ NODE_NAME_CASE(FGETEXP)
+ NODE_NAME_CASE(FGETEXP_SAE)
+ NODE_NAME_CASE(FGETEXPS)
+ NODE_NAME_CASE(FGETEXPS_SAE)
+ NODE_NAME_CASE(SCALEF)
+ NODE_NAME_CASE(SCALEF_RND)
+ NODE_NAME_CASE(SCALEFS)
+ NODE_NAME_CASE(SCALEFS_RND)
+ NODE_NAME_CASE(AVG)
+ NODE_NAME_CASE(MULHRS)
+ NODE_NAME_CASE(SINT_TO_FP_RND)
+ NODE_NAME_CASE(UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTTP2SI)
+ NODE_NAME_CASE(CVTTP2UI)
+ NODE_NAME_CASE(STRICT_CVTTP2SI)
+ NODE_NAME_CASE(STRICT_CVTTP2UI)
+ NODE_NAME_CASE(MCVTTP2SI)
+ NODE_NAME_CASE(MCVTTP2UI)
+ NODE_NAME_CASE(CVTTP2SI_SAE)
+ NODE_NAME_CASE(CVTTP2UI_SAE)
+ NODE_NAME_CASE(CVTTS2SI)
+ NODE_NAME_CASE(CVTTS2UI)
+ NODE_NAME_CASE(CVTTS2SI_SAE)
+ NODE_NAME_CASE(CVTTS2UI_SAE)
+ NODE_NAME_CASE(CVTSI2P)
+ NODE_NAME_CASE(CVTUI2P)
+ NODE_NAME_CASE(STRICT_CVTSI2P)
+ NODE_NAME_CASE(STRICT_CVTUI2P)
+ NODE_NAME_CASE(MCVTSI2P)
+ NODE_NAME_CASE(MCVTUI2P)
+ NODE_NAME_CASE(VFPCLASS)
+ NODE_NAME_CASE(VFPCLASSS)
+ NODE_NAME_CASE(MULTISHIFT)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTPS2PH)
+ NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(CVTPH2PS)
+ NODE_NAME_CASE(STRICT_CVTPH2PS)
+ NODE_NAME_CASE(CVTPH2PS_SAE)
+ NODE_NAME_CASE(CVTP2SI)
+ NODE_NAME_CASE(CVTP2UI)
+ NODE_NAME_CASE(MCVTP2SI)
+ NODE_NAME_CASE(MCVTP2UI)
+ NODE_NAME_CASE(CVTP2SI_RND)
+ NODE_NAME_CASE(CVTP2UI_RND)
+ NODE_NAME_CASE(CVTS2SI)
+ NODE_NAME_CASE(CVTS2UI)
+ NODE_NAME_CASE(CVTS2SI_RND)
+ NODE_NAME_CASE(CVTS2UI_RND)
+ NODE_NAME_CASE(CVTNE2PS2BF16)
+ NODE_NAME_CASE(CVTNEPS2BF16)
+ NODE_NAME_CASE(MCVTNEPS2BF16)
+ NODE_NAME_CASE(DPBF16PS)
+ NODE_NAME_CASE(LWPINS)
+ NODE_NAME_CASE(MGATHER)
+ NODE_NAME_CASE(MSCATTER)
+ NODE_NAME_CASE(VPDPBUSD)
+ NODE_NAME_CASE(VPDPBUSDS)
+ NODE_NAME_CASE(VPDPWSSD)
+ NODE_NAME_CASE(VPDPWSSDS)
+ NODE_NAME_CASE(VPSHUFBITQMB)
+ NODE_NAME_CASE(GF2P8MULB)
+ NODE_NAME_CASE(GF2P8AFFINEQB)
+ NODE_NAME_CASE(GF2P8AFFINEINVQB)
+ NODE_NAME_CASE(NT_CALL)
+ NODE_NAME_CASE(NT_BRIND)
+ NODE_NAME_CASE(UMWAIT)
+ NODE_NAME_CASE(TPAUSE)
+ NODE_NAME_CASE(ENQCMD)
+ NODE_NAME_CASE(ENQCMDS)
+ NODE_NAME_CASE(VP2INTERSECT)
}
return nullptr;
+#undef NODE_NAME_CASE
}
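The NODE_NAME_CASE conversion above leans on the preprocessor's stringizing operator so that every opcode is spelled exactly once, which is also what retires the stale single-colon strings for ENQCMD/ENQCMDS in the removed lines. A minimal standalone sketch of the same pattern (enum and names invented for illustration):

#include <cstdio>

enum class Opcode { Add, Sub, Mul };

static const char *opcodeName(Opcode Op) {
  switch (Op) {
#define NAME_CASE(N) case Opcode::N: return "Opcode::" #N;
  NAME_CASE(Add)
  NAME_CASE(Sub)
  NAME_CASE(Mul)
#undef NAME_CASE
  }
  return nullptr;
}

int main() { std::printf("%s\n", opcodeName(Opcode::Mul)); } // prints Opcode::Mul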
/// Return true if the addressing mode represented by AM is legal for this
@@ -30018,7 +30716,8 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
return false;
// XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
- if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
+ // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
+ if (Subtarget.hasXOP() &&
(Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
return false;
@@ -30104,7 +30803,7 @@ bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
- if (!VT1.isInteger() || !VT2.isInteger())
+ if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
@@ -30145,6 +30844,39 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool X86TargetLowering::shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const {
+ // A uniform shift amount in a vector shift or funnel shift may be much
+ // cheaper than a generic variable vector shift, so make that pattern visible
+ // to SDAG by sinking the shuffle instruction next to the shift.
+ int ShiftAmountOpNum = -1;
+ if (I->isShift())
+ ShiftAmountOpNum = 1;
+ else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::fshl ||
+ II->getIntrinsicID() == Intrinsic::fshr)
+ ShiftAmountOpNum = 2;
+ }
+
+ if (ShiftAmountOpNum == -1)
+ return false;
+
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
+ if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
+ isVectorShiftByScalarCheap(I->getType())) {
+ Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
+ return true;
+ }
+
+ return false;
+}
+
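To see why shouldSinkOperands cares about splatted shift amounts: when the amount is uniform across lanes the backend can use the classic shift-by-scalar-in-XMM forms, while a genuinely per-lane amount needs the AVX2 variable shifts, which are slower on many cores. A hedged illustration using real SSE2/AVX2 intrinsics (the comparison framing is ours; the second function needs AVX2 enabled):

#include <immintrin.h>

// Uniform amount: the count sits in the low 64 bits of an XMM register and
// applies to every lane (SSE2 'pslld xmm, xmm').
__m128i shl_uniform(__m128i V, int Amt) {
  return _mm_sll_epi32(V, _mm_cvtsi32_si128(Amt));
}

// Per-lane amounts: requires the AVX2 variable shift 'vpsllvd'.
__m128i shl_per_lane(__m128i V, __m128i Amts) {
  return _mm_sllv_epi32(V, Amts);
}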
+bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
+ if (!Subtarget.is64Bit())
+ return false;
+ return TargetLowering::shouldConvertPhiType(From, To);
+}
+
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;
@@ -30188,7 +30920,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
-bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
if (!VT.isSimple())
return false;
@@ -30218,8 +30950,8 @@ bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
}
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
- // If the subtarget is using retpolines, we need to not generate jump tables.
- if (Subtarget.useRetpolineIndirectBranches())
+ // If the subtarget is using thunks, we must not generate jump tables.
+ if (Subtarget.useIndirectThunkBranches())
return false;
// Otherwise, fallback on the generic logic.
@@ -30333,7 +31065,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
- unsigned Align = MI.getOperand(8).getImm();
+ Align Alignment = Align(MI.getOperand(8).getImm());
MachineFunction *MF = MBB->getParent();
@@ -30373,7 +31105,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
- bool NeedsAlign = (Align > 8);
+ bool NeedsAlign = (Alignment > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
@@ -30521,17 +31253,16 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
- assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
- .addReg(OverflowAddrReg)
- .addImm(Align-1);
+ .addReg(OverflowAddrReg)
+ .addImm(Alignment.value() - 1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
- .addReg(TmpReg)
- .addImm(~(uint64_t)(Align-1));
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Alignment.value() - 1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
@@ -30627,7 +31358,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
- /*Size=*/16, /*Align=*/16);
+ /*Size=*/16, Align(16));
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
@@ -30694,11 +31425,13 @@ static bool isCMOVPseudo(MachineInstr &MI) {
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
@@ -30995,8 +31728,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
- ++NextMIIt;
- NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
+ NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
}
}
@@ -31068,6 +31800,112 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
return SinkMBB;
}
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
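getSUBriOpcode prefers the ri8 forms because x86 has a sign-extended imm8 encoding that is three bytes shorter than the imm32 one. A compile-time model of the isInt<8> range test (helper name ours):

#include <cstdint>

// An immediate can use the sign-extended imm8 encoding iff it fits in a
// signed 8-bit value, i.e. lies in [-128, 127].
constexpr bool fitsImm8(int64_t Imm) { return Imm >= -128 && Imm <= 127; }

static_assert(fitsImm8(127) && !fitsImm8(128), "boundary of the imm8 form");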
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
+ DebugLoc DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+
+ const unsigned ProbeSize = getStackProbeSize(*MF);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MF->insert(MBBIter, testMBB);
+ MF->insert(MBBIter, blockMBB);
+ MF->insert(MBBIter, tailMBB);
+
+ Register sizeVReg = MI.getOperand(1).getReg();
+
+ Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+
+ Register TmpStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+ Register FinalStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+
+ BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
+ .addReg(physSPReg);
+ {
+ const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+ .addReg(TmpStackPtr)
+ .addReg(sizeVReg);
+ }
+
+ // Loop test: compare the final stack pointer with the current stack pointer.
+
+ BuildMI(testMBB, DL,
+ TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackPtr)
+ .addReg(physSPReg);
+
+ BuildMI(testMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(tailMBB)
+ .addImm(X86::COND_L);
+ testMBB->addSuccessor(blockMBB);
+ testMBB->addSuccessor(tailMBB);
+
+ // Touch the block then extend it. This is done in the opposite order from a
+ // static probe, where we allocate then touch, to avoid the need of probing
+ // the tail of the static alloca. Possible scenarios are:
+ //
+ //       + ---- <- ------------ <- ------------- <- ------------ +
+ //       |                                                       |
+ // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+ //                                                               |                                                               |
+ //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
+ //
+ // The property we want to enforce is to never have more than [page alloc]
+ // between two probes.
+
+ const unsigned MovMIOpc =
+ TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
+ addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
+ .addImm(0);
+
+ BuildMI(blockMBB, DL,
+ TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
+ .addReg(physSPReg)
+ .addImm(ProbeSize);
+
+ BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
+ blockMBB->addSuccessor(testMBB);
+
+ // Replace original instruction by the expected stack ptr
+ BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(FinalStackPtr);
+
+ tailMBB->splice(tailMBB->end(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(testMBB);
+
+ // Delete the original pseudo instruction.
+ MI.eraseFromParent();
+
+ // And we're done.
+ return tailMBB;
+}
+
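As a rough scalar model of the testMBB/blockMBB/tailMBB loop just emitted: each iteration touches the current stack page and then extends by one probe interval, so no more than ProbeSize bytes are ever allocated between two probes. The sketch below uses byte offsets instead of real stack memory and simplifies the flag/condition handling (function and names hypothetical):

#include <cstddef>

// Returns how many pages get touched for a dynamic allocation of Size bytes
// with the given probe interval.
std::size_t probeTouches(std::size_t Size, std::size_t ProbeSize) {
  std::size_t Extended = 0, Touches = 0;
  while (Extended < Size) { // testMBB: compare SP against FinalStackPtr
    ++Touches;              // blockMBB: store to the current stack page
    Extended += ProbeSize;  // blockMBB: SUB the probe interval off SP
  }
  return Touches;           // tailMBB: hand the final pointer to the user
}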
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -31228,29 +32066,16 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
+ // Marking this as an EH pad but not a funclet entry block causes PEI to
+ // restore stack pointers in the block.
+ RestoreMBB->setIsEHPad(true);
+
auto RestoreMBBI = RestoreMBB->begin();
- BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- MachineFunction *MF = BB->getParent();
- const Constant *PerFn = MF->getFunction().getPersonalityFn();
- bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
- // Only 32-bit SEH requires special handling for catchpad.
- if (IsSEH && Subtarget.is32Bit()) {
- const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
- BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
- }
- MI.eraseFromParent();
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
@@ -31342,22 +32167,22 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
return BB;
}
-static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
+static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
switch (RPOpc) {
- case X86::RETPOLINE_CALL32:
+ case X86::INDIRECT_THUNK_CALL32:
return X86::CALLpcrel32;
- case X86::RETPOLINE_CALL64:
+ case X86::INDIRECT_THUNK_CALL64:
return X86::CALL64pcrel32;
- case X86::RETPOLINE_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN32:
return X86::TCRETURNdi;
- case X86::RETPOLINE_TCRETURN64:
+ case X86::INDIRECT_THUNK_TCRETURN64:
return X86::TCRETURNdi64;
}
- llvm_unreachable("not retpoline opcode");
+ llvm_unreachable("not indirect thunk opcode");
}
-static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
- unsigned Reg) {
+static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
+ unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
@@ -31389,39 +32214,48 @@ static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
+ llvm_unreachable("unexpected reg for external indirect thunk");
+ }
+
+ if (Subtarget.useRetpolineIndirectCalls() ||
+ Subtarget.useRetpolineIndirectBranches()) {
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
+ }
llvm_unreachable("unexpected reg for retpoline");
}
- // When targeting an internal COMDAT thunk use an LLVM-specific name.
- switch (Reg) {
- case X86::EAX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_eax";
- case X86::ECX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_ecx";
- case X86::EDX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_edx";
- case X86::EDI:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_edi";
- case X86::R11:
+ if (Subtarget.useLVIControlFlowIntegrity()) {
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
- return "__llvm_retpoline_r11";
+ return "__llvm_lvi_thunk_r11";
}
- llvm_unreachable("unexpected reg for retpoline");
+ llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}
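For context, an indirect call like the one below is what gets routed through these thunks: the callee is first copied into a scratch register (r11 on x86-64 when available) and the call targets the matching thunk symbol. Source-level illustration only; the actual register choice is made by the scan in EmitLoweredIndirectThunk:

// With retpolines enabled, this indirect call is lowered roughly as
//   movq %rdi, %r11 ; callq __llvm_retpoline_r11
// (or an __x86_indirect_thunk_* name when external thunks are requested).
using Callback = int (*)(int);
int invoke(Callback F, int X) { return F(X); }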
MachineBasicBlock *
-X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
- unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
+ unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
@@ -31455,7 +32289,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
- const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
+ const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
@@ -31743,12 +32577,17 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
- Register ZReg = MRI.createVirtualRegister(PtrRC);
- unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
- BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
- .addDef(ZReg)
- .addReg(ZReg, RegState::Undef)
- .addReg(ZReg, RegState::Undef);
+ Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
+
+ if (PVT == MVT::i64) {
+ Register TmpZReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
+ .addImm(0)
+ .addReg(ZReg)
+ .addImm(X86::sub_32bit);
+ ZReg = TmpZReg;
+ }
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
@@ -31877,7 +32716,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
@@ -32224,6 +33063,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
+ auto TMMImmToTMMReg = [](unsigned Imm) {
+ assert (Imm < 8 && "Illegal tmm index");
+ return X86::TMM0 + Imm;
+ };
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
@@ -32231,18 +33074,19 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
- case X86::RETPOLINE_CALL32:
- case X86::RETPOLINE_CALL64:
- case X86::RETPOLINE_TCRETURN32:
- case X86::RETPOLINE_TCRETURN64:
- return EmitLoweredRetpoline(MI, BB);
+ case X86::INDIRECT_THUNK_CALL32:
+ case X86::INDIRECT_THUNK_CALL64:
+ case X86::INDIRECT_THUNK_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN64:
+ return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
- case X86::CATCHPAD:
- return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
+ case X86::PROBED_ALLOCA_32:
+ case X86::PROBED_ALLOCA_64:
+ return EmitLoweredProbedAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
@@ -32256,11 +33100,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
@@ -32315,7 +33161,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int OrigCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
@@ -32336,7 +33183,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
- int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int NewCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
@@ -32471,6 +33319,97 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB->addLiveIn(BasePtr);
return BB;
}
+ case TargetOpcode::PREALLOCATED_SETUP: {
+ assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setHasPreallocatedCall(true);
+ int64_t PreallocatedId = MI.getOperand(0).getImm();
+ size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
+ assert(StackAdjustment != 0 && "0 stack adjustment");
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
+ << StackAdjustment << "\n");
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
+ .addReg(X86::ESP)
+ .addImm(StackAdjustment);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case TargetOpcode::PREALLOCATED_ARG: {
+ assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
+ int64_t PreallocatedId = MI.getOperand(1).getImm();
+ int64_t ArgIdx = MI.getOperand(2).getImm();
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
+ << ", arg offset " << ArgOffset << "\n");
+ // stack pointer + offset
+ addRegOffset(
+ BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
+ X86::ESP, false, ArgOffset);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case X86::PTDPBSSD:
+ case X86::PTDPBSUD:
+ case X86::PTDPBUSD:
+ case X86::PTDPBUUD:
+ case X86::PTDPBF16PS: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
+ case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILEZERO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Imm = MI.getOperand(0).getImm();
+ BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILELOADD:
+ case X86::PTILELOADDT1:
+ case X86::PTILESTORED: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTILELOADD: Opc = X86::TILELOADD; break;
+ case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
+ case X86::PTILESTORED: Opc = X86::TILESTORED; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ unsigned CurOp = 0;
+ if (Opc != X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Define);
+
+ MIB.add(MI.getOperand(CurOp++)); // base
+ MIB.add(MI.getOperand(CurOp++)); // scale
+ MIB.add(MI.getOperand(CurOp++)); // index -- stride
+ MIB.add(MI.getOperand(CurOp++)); // displacement
+ MIB.add(MI.getOperand(CurOp++)); // segment
+
+ if (Opc == X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
}
}
@@ -32480,20 +33419,53 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &Demanded,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
- // Only optimize Ands to prevent shrinking a constant that could be
- // matched by movzx.
- if (Op.getOpcode() != ISD::AND)
- return false;
-
EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSize = VT.getScalarSizeInBits();
- // Ignore vectors.
- if (VT.isVector())
+ if (VT.isVector()) {
+ // If the constant is all sign bits within the demanded active bits, then
+ // we should extend it to the entire constant so that it can act as a
+ // boolean constant vector.
+ auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i] || V.getOperand(i).isUndef())
+ continue;
+ const APInt &Val = V.getConstantOperandAPInt(i);
+ if (Val.getBitWidth() > Val.getNumSignBits() &&
+ Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
+ return true;
+ }
+ return false;
+ };
+ // For vectors - if we have a constant, then try to sign extend.
+ // TODO: Handle AND/ANDN cases.
+ unsigned ActiveBits = DemandedBits.getActiveBits();
+ if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
+ (Opcode == ISD::OR || Opcode == ISD::XOR) &&
+ NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
+ EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
+ EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
+ VT.getVectorNumElements());
+ SDValue NewC =
+ TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
+ Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
+ SDValue NewOp =
+ TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ }
return false;
+ }
- unsigned Size = VT.getSizeInBits();
+ // Only optimize Ands to prevent shrinking a constant that could be
+ // matched by movzx.
+ if (Opcode != ISD::AND)
+ return false;
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
@@ -32503,7 +33475,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
- APInt ShrunkMask = Mask & Demanded;
+ APInt ShrunkMask = Mask & DemandedBits;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
@@ -32515,10 +33487,10 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Truncate the width to size to handle illegal types.
- Width = std::min(Width, Size);
+ Width = std::min(Width, EltSize);
// Calculate a possible zero extend mask for this constant.
- APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+ APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
@@ -32527,7 +33499,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
- if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
+ if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
return false;
// Replace the constant with the zero extend mask.
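A scalar model of the shrink-to-zero-extend-mask logic above may make the flow easier to follow; APInt is replaced by uint64_t, the helper name is ours, and C++20 <bit> stands in for PowerOf2Ceil/getActiveBits:

#include <algorithm>
#include <bit>
#include <cstdint>

// Returns true if the AND constant can be kept or widened to a low-bits mask
// (matchable by movzx) without changing any demanded bit.
bool shrinkAndMask(uint64_t Mask, uint64_t Demanded, unsigned EltSize,
                   uint64_t &NewMask) {
  uint64_t Shrunk = Mask & Demanded;              // clear non-demanded bits
  unsigned Width = 64 - std::countl_zero(Shrunk); // active bits of the mask
  if (Width == 0)
    return false;
  Width = std::bit_ceil(std::max(Width, 8u));     // next power of 2, >= a byte
  Width = std::min(Width, EltSize);               // clamp to the element size
  uint64_t ZExtMask = Width >= 64 ? ~0ull : (1ull << Width) - 1;
  if (ZExtMask == Mask)
    return true;                                  // already ideal, keep it
  if ((ZExtMask & ~(Mask | ~Demanded)) != 0)      // must stay a subset
    return false;
  NewMask = ZExtMask; // e.g. Mask=0xFFF, Demanded=0xF0 gives NewMask=0xFF
  return true;
}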
@@ -32543,6 +33515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -32570,7 +33543,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth, false);
+ Known = Known.anyextOrTrunc(BitWidth);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
@@ -32640,10 +33613,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
}
case X86ISD::PSADBW: {
@@ -32667,6 +33637,76 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero &= Known2.Zero;
break;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ break;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ Known = DAG.computeKnownBits(Op0, Depth + 1);
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ }
+ }
+ break;
+ }
+ case X86ISD::CVTSI2P:
+ case X86ISD::CVTUI2P:
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI:
+ case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI:
+ case X86ISD::MCVTTP2UI:
+ case X86ISD::MCVTSI2P:
+ case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND:
+ case X86ISD::VMFPROUND:
+ case X86ISD::CVTPS2PH:
+ case X86ISD::MCVTPS2PH: {
+ // Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ case X86ISD::STRICT_CVTPS2PH: {
+ // Strict Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(1).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::MOVQ2DQ: {
+ // Move from MMX to XMM. Upper half of XMM should be 0.
+ if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
+ Known.setAllZero();
+ break;
+ }
}
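The BEXTR case above decodes the control operand as an 8-bit start and an 8-bit length before narrowing the known bits. A scalar model of the instruction semantics it relies on (saturation details simplified):

#include <cstdint>

// 32-bit BEXTR: control bits [7:0] select the start bit, bits [15:8] the
// field length; a zero length or an out-of-range start yields zero.
uint32_t bextr32(uint32_t Src, uint32_t Control) {
  unsigned Start = Control & 0xFF;
  unsigned Len = (Control >> 8) & 0xFF;
  if (Len == 0 || Start >= 32)
    return 0;
  uint64_t Field = (uint64_t)Src >> Start;
  if (Len < 32)
    Field &= ((uint64_t)1 << Len) - 1;
  return (uint32_t)Field;
}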
// Handle target shuffles.
@@ -32733,11 +33773,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return VTBits;
case X86ISD::VTRUNC: {
- // TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
- unsigned NumSrcBits = Src.getScalarValueSizeInBits();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
- unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
@@ -32865,6 +33906,21 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
return N;
}
+// Helper to look for a normal load that can be narrowed into a vzload with the
+// specified VT and memory VT. Returns SDValue() on failure.
+static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
+ SelectionDAG &DAG) {
+ // Can't if the load is volatile or atomic.
+ if (!LN->isSimple())
+ return SDValue();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+}
+
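Semantically, the vzload that narrowLoadToVZLoad builds reads only MemVT's width from memory and guarantees zeros in the remaining vector bytes. A byte-level sketch for the 128-bit/64-bit case (sizes chosen for illustration):

#include <array>
#include <cstring>

// Model of a 128-bit vzload with a 64-bit memory VT: the low 8 bytes come
// from memory, the upper 8 bytes are known zero.
std::array<unsigned char, 16> vzload64(const void *Ptr) {
  std::array<unsigned char, 16> V{}; // value-initialized: all zeros
  std::memcpy(V.data(), Ptr, 8);     // narrow load into the low half
  return V;
}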
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
@@ -33009,9 +34065,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
-
- bool ContainsZeros =
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ bool ContainsZeros = isAnyZero(Mask);
// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
@@ -33059,7 +34113,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
- scaleShuffleMask<int>(2, RepeatedMask, WordMask);
+ narrowShuffleMaskElts(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
@@ -33102,17 +34156,32 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
Mask, 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
+ if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
+ 32 <= ShuffleVT.getScalarSizeInBits())) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
+ // Attempt to match against bit rotates.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+ Subtarget.hasAVX512())) {
+ int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+ Subtarget, Mask);
+ if (0 < RotateAmt) {
+ Shuffle = X86ISD::VROTLI;
+ PermuteImm = (unsigned)RotateAmt;
+ return true;
+ }
+ }
+
return false;
}
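Several hunks in this patch replace scaleShuffleMask with narrowShuffleMaskElts; the operation itself is easy to model. A standalone sketch, keeping the convention that negative entries are sentinels (SM_SentinelUndef/SM_SentinelZero) and are replicated as-is:

#include <vector>

// Each wide element index M becomes Scale consecutive narrow indices
// M*Scale .. M*Scale+Scale-1; negative sentinels are replicated unchanged.
std::vector<int> narrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  Out.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Out.push_back(M < 0 ? M : M * Scale + i);
  return Out;
}
// e.g. narrowMaskElts(2, {1, 0}) yields {2, 3, 0, 1}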
@@ -33193,9 +34262,29 @@ static bool matchBinaryPermuteShuffle(
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+ // Attempt to match against VALIGND/VALIGNQ rotate.
+ if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
+ ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ if (!isAnyZero(Mask)) {
+ int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
+ if (0 < Rotation) {
+ Shuffle = X86ISD::VALIGN;
+ if (EltSizeInBits == 64)
+ ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
+ else
+ ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
+ PermuteImm = Rotation;
+ return true;
+ }
+ }
+ }
+
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
@@ -33245,8 +34334,7 @@ static bool matchBinaryPermuteShuffle(
// Attempt to combine to INSERTPS, but only if it has elements that need to
// be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
- MaskVT.is128BitVector() &&
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
+ MaskVT.is128BitVector() && isAnyZero(Mask) &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -33374,6 +34462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, V1);
}
+ bool OptForSize = DAG.shouldOptForSize();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -33384,11 +34473,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Don't combine if we are an AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
- // TODO - this currently prevents all lane shuffles from occurring.
- // TODO - check for writemasks usage instead of always preventing combining.
- // TODO - attempt to narrow Mask back to writemask size.
- bool IsEVEXShuffle =
- RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ bool IsMaskedShuffle = false;
+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
+ if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
+ Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
+ IsMaskedShuffle = true;
+ }
+ }
+
+ // If we are shuffling a broadcast (and not introducing zeros) then we can
+ // just use the broadcast directly. This works for smaller broadcast elements
+ // as well, since they already repeat across each mask element.
+ if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
+ return DAG.getBitcast(RootVT, V1);
+ }
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
@@ -33408,27 +34507,138 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+ // Handle 128/256-bit lane shuffles of 512-bit vectors.
+ if (RootVT.is512BitVector() &&
+ (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
+ MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+
+ // If the upper subvectors are zeroable, then an extract+insert is cheaper
+ // than using X86ISD::SHUF128. The insertion is free, even if it has to
+ // zero the upper subvectors.
+ if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
+ "Unexpected lane shuffle");
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
+ bool UseZero = isAnyZero(BaseMask);
+ Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
+ Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Narrow shuffle mask to v4x128.
+ SmallVector<int, 4> Mask;
+ assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
+ narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
+
+ // Try to lower to vshuf64x2/vshuf32x4.
+ auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ unsigned PermMask = 0;
+ // Ensure all elements came from the same Op.
+ SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
+ for (int i = 0; i < 4; ++i) {
+ assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Mask[i] < 0)
+ continue;
+
+ SDValue Op = Mask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+
+ // Convert the 128-bit shuffle mask selection values into 128-bit
+ // selection bits defined by a vshuf64x2 instruction's immediate control
+ // byte.
+ PermMask |= (Mask[i] % 4) << (i * 2);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, Ops[0]),
+ DAG.getBitcast(ShuffleVT, Ops[1]),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ };
+
+ // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
+ // doesn't work because our mask is for 128 bits and we don't have an MVT
+ // to match that.
+ bool PreferPERMQ =
+ UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
+ isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
+ isUndefOrInRange(Mask[3], 2, 4) &&
+ (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
+ (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
+
+ if (!isAnyZero(Mask) && !PreferPERMQ) {
+ if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
+ return DAG.getBitcast(RootVT, V);
+ }
+ }
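The immediate built inside MatchSHUF128 packs one 2-bit source-lane selector per 128-bit result lane. A hypothetical compile-time check of that encoding (mask values chosen for illustration):

#include <array>
#include <cstdint>

// Mirrors the PermMask construction above: (Mask[i] % 4) << (i * 2).
constexpr uint8_t shuf128Imm(std::array<int, 4> Mask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (Mask[i] % 4) << (i * 2);
  return static_cast<uint8_t>(Imm);
}

// Lanes 0,1 of V1 and lanes 1,3 of V2: imm = 0b11'01'01'00 = 0xD4.
static_assert(shuf128Imm({0, 1, 5, 7}) == 0xD4, "vshuf64x2 immediate");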
// Handle 128-bit lane shuffles of 256-bit vectors.
- // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
- // we need to use the zeroing feature.
- // TODO - this should support binary shuffles.
- if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
- !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
- !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
+ MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+
+ // If the upper half is zeroable, then an extract+insert is cheaper than
+ // using X86ISD::VPERM2X128. The insertion is free, even if it has to
+ // zero the upper half.
+ if (isUndefOrZero(BaseMask[1])) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
+ Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
- MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
- unsigned PermMask = 0;
- PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
- PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
- Res = DAG.getBitcast(ShuffleVT, V1);
- Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
- DAG.getUNDEF(ShuffleVT),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
+
+ // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+ // we need to use the zeroing feature.
+ // Prefer blends for sequential shuffles unless we are optimizing for size.
+ if (UnaryShuffle &&
+ !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+ (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+
+ // TODO - handle AVX512VL cases with X86ISD::SHUF128.
+ if (!UnaryShuffle && !IsMaskedShuffle) {
+ assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+ "Unexpected shuffle sentinel value");
+ // Prefer blends to X86ISD::VPERM2X128.
+ if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+ (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] & 3) << 0);
+ PermMask |= ((BaseMask[1] & 3) << 4);
+
+ Res = DAG.getNode(
+ X86ISD::VPERM2X128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
}
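As a side note on the immediate built above: a VPERM2X128/VPERM2F128 selector packs one nibble per 128-bit destination lane, where values 0/1 pick a lane of the first source, 2/3 a lane of the second, and bit 3 (0x8) zeroes that half. A minimal sketch of the encoding, using a hypothetical helper name that is not part of the patch:

// Illustrative only: build the imm8 from a widened 2 x 128-bit lane mask,
// where a negative mask entry means "zero this destination half".
static unsigned encodeVPerm2X128Imm(int LoLane, int HiLane) {
  unsigned Imm = 0;
  Imm |= (LoLane < 0 ? 0x8u : unsigned(LoLane & 3)) << 0; // selects dst[127:0]
  Imm |= (HiLane < 0 ? 0x8u : unsigned(HiLane & 3)) << 4; // selects dst[255:128]
  return Imm; // e.g. {3, 0} -> 0x03: low half from V2's upper lane, high half from V1's lower lane
}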
// For masks that have been widened to 128-bit elements or more,
@@ -33437,9 +34647,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
- scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
+ narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
} else {
- Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+ Mask.assign(BaseMask.begin(), BaseMask.end());
+ }
+
+ // For masked shuffles, we're trying to match the root width for better
+ // writemask folding, attempt to scale the mask.
+ // TODO - variable shuffles might need this to be widened again.
+ if (IsMaskedShuffle && NumRootElts > Mask.size()) {
+ assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
+ int MaskScale = NumRootElts / Mask.size();
+ SmallVector<int, 64> ScaledMask;
+ narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
}
unsigned NumMaskElts = Mask.size();
@@ -33472,26 +34693,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
- // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
- // directly if we don't shuffle the lower element and we shuffle the upper
- // (zero) elements within themselves.
- if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
- MaskEltSizeInBits) == 0) {
- unsigned Scale =
- cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
- MaskEltSizeInBits;
- ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
- if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
- isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
- return DAG.getBitcast(RootVT, V1);
- }
- }
-
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
- if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
- && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+ if ((Subtarget.hasAVX2() ||
+ (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
+ (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
@@ -33517,7 +34723,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
@@ -33528,7 +34735,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
@@ -33538,12 +34746,31 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Attempt to combine to INSERTPS, but only if the inserted element has come
+ // from a scalar.
+ // TODO: Handle other insertions here as well?
+ if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
+ MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, SrcV1),
+ DAG.getBitcast(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
@@ -33554,10 +34781,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
- if (matchBinaryPermuteShuffle(
- MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
- NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, NewV1, NewV2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
@@ -33597,6 +34824,44 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Match shuffle against TRUNCATE patterns.
+ if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
+ // Match against a VTRUNC instruction, accounting for src/dst sizes.
+ if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
+ Subtarget)) {
+ bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
+ ShuffleSrcVT.getVectorNumElements();
+ unsigned Opc =
+ IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
+ if (Depth == 0 && Root.getOpcode() == Opc)
+ return SDValue(); // Nothing to do!
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
+ if (ShuffleVT.getSizeInBits() < RootSizeInBits)
+ Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Do we need a more general binary truncation pattern?
+ if (RootSizeInBits < 512 &&
+ ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
+ (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
+ (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
+ isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+ return SDValue(); // Nothing to do!
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ V2 = DAG.getBitcast(ShuffleSrcVT, V2);
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
+ Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
@@ -33606,8 +34871,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
- bool MaskContainsZeros =
- any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
@@ -33702,7 +34966,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
- FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+ MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
@@ -33779,7 +35043,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
if (M == SM_SentinelZero) {
- PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
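The 255 -> 0x80 rewrite above is cosmetic: PSHUFB zeroes a destination byte whenever bit 7 of the corresponding control byte is set, so both constants behave the same, and 0x80 simply spells out the one bit that matters (the VPPERM change in the next hunk is likewise just 128 written as 0x80). A minimal model of the PSHUFB per-byte rule, for illustration only:

#include <cstdint>
// Behaviour of one control byte of PSHUFB within a 16-byte lane.
static uint8_t pshufbByte(const uint8_t Src[16], uint8_t Ctl) {
  return (Ctl & 0x80) ? 0 : Src[Ctl & 0x0F]; // MSB set -> byte becomes zero
}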
@@ -33810,7 +35074,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
if (M == SM_SentinelZero) {
- VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+ VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
@@ -33885,8 +35149,7 @@ static SDValue combineX86ShuffleChainWithExtract(
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
EVT BaseVT = Src.getValueType();
- while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(Src.getOperand(1))) {
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
@@ -33998,6 +35261,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
return SDValue();
// Shuffle the constant bits according to the mask.
+ SDLoc DL(Root);
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
@@ -34035,6 +35299,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
}
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
+ // Attempt to create a zero vector.
+ if ((UndefElts | ZeroElts).isAllOnesValue())
+ return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
+
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
@@ -34043,8 +35311,9 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+ return SDValue();
- SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
return DAG.getBitcast(VT, CstOp);
}
@@ -34103,7 +35372,8 @@ static SDValue combineX86ShufflesRecursively(
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
- assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+ unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+ assert(VT.getSizeInBits() == RootSizeInBits &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
@@ -34117,6 +35387,18 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
+ // Shuffle inputs must be the same size as the result, bail on any larger
+ // inputs and widen any smaller inputs.
+ if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() > RootSizeInBits;
+ }))
+ return SDValue();
+
+ for (SDValue &Op : OpInputs)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
+ SDLoc(Op), RootSizeInBits);
+
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
@@ -34517,6 +35799,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
+// Attempt to commute shufps LHS loads:
+// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ // TODO: Add vXf64 support.
+ if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
+ return SDValue();
+
+ // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
+ auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
+ if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ SDValue N0 = V.getOperand(0);
+ SDValue N1 = V.getOperand(1);
+ unsigned Imm = V.getConstantOperandVal(2);
+ if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+ MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ return SDValue();
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
+ DAG.getTargetConstant(Imm, DL, MVT::i8));
+ };
+
+ switch (N.getOpcode()) {
+ case X86ISD::VPERMILPI:
+ if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
+ unsigned Imm = N.getConstantOperandVal(1);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ }
+ break;
+ case X86ISD::SHUFP: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (N0 == N1) {
+ if (SDValue NewSHUFP = commuteSHUFP(N, N0))
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
+ DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
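To make the immediate arithmetic in combineCommutableSHUFP above easier to follow: a SHUFPS imm8 holds four 2-bit element selectors, the low nibble reading the first operand and the high nibble the second, so commuting the operands swaps the nibbles and leaves the result with its two halves exchanged. Any lane-local consumer then has to flip bit 1 of every selector that referred to the commuted value, which is what the 0xAA / 0x0A / 0xA0 XORs do. A sketch with hypothetical helper names:

// Swap which nibble reads which source after commuting SHUFPS operands.
static unsigned commuteShufpsImm(unsigned Imm) {
  return ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
}
// Redirect all four 2-bit selectors of a following VPERMILPS across the
// swapped halves; XOR with 0x0A or 0xA0 instead when only one nibble of a
// consuming SHUFPS points at the commuted value.
static unsigned fixupPermilpsImm(unsigned Imm) { return Imm ^ 0xAA; }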
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34526,35 +35861,105 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
+ bool IsUnary;
+ SmallVector<int, 64> TargetMask;
+ SmallVector<SDValue, 2> TargetOps;
+ if (isTargetShuffle(Opcode))
+ getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
+
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
- // single instruction.
- if (VT.getScalarSizeInBits() == 64 &&
- (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
- Opcode == X86ISD::UNPCKL)) {
- auto BC0 = peekThroughBitcasts(N.getOperand(0));
- auto BC1 = peekThroughBitcasts(N.getOperand(1));
- EVT VT0 = BC0.getValueType();
- EVT VT1 = BC1.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- unsigned Opcode1 = BC1.getOpcode();
- if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
- Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- SDValue Lo, Hi;
- if (Opcode == X86ISD::MOVSD) {
- Lo = BC1.getOperand(0);
- Hi = BC0.getOperand(1);
- } else {
- Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
- Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ SmallVector<int, 16> TargetMask128;
+ if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
+ isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
+ SmallVector<int, 16> WidenedMask128 = TargetMask128;
+ while (WidenedMask128.size() > 2) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
+ break;
+ WidenedMask128 = std::move(WidenedMask);
+ }
+ if (WidenedMask128.size() == 2) {
+ assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
+ SDValue BC0 = peekThroughBitcasts(TargetOps.front());
+ SDValue BC1 = peekThroughBitcasts(TargetOps.back());
+ EVT VT0 = BC0.getValueType();
+ EVT VT1 = BC1.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ unsigned Opcode1 = BC1.getOpcode();
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ if (Opcode0 == Opcode1 && VT0 == VT1 &&
+ (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ bool SingleOp = (TargetOps.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WidenedMask128[0] & 1);
+ Hi = Hi.getOperand(WidenedMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ return DAG.getBitcast(VT, Horiz);
+ }
}
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
}
}
+ if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+ return R;
+
+ // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+ // help expose the 'NOT' pattern further up the DAG.
+ // TODO: This might be beneficial for any binop with a 'splattable' operand.
switch (Opcode) {
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD: {
+ SDValue Src = N.getOperand(0);
+ if (Src.hasOneUse() && Src.getValueType() == VT) {
+ if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+ Not = DAG.getBitcast(VT, Not);
+ Not = Opcode == X86ISD::MOVDDUP
+ ? DAG.getNode(Opcode, DL, VT, Not)
+ : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+ EVT IntVT = Not.getValueType().changeTypeToInteger();
+ SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+ Not = DAG.getBitcast(IntVT, Not);
+ Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+ return DAG.getBitcast(VT, Not);
+ }
+ }
+ break;
+ }
+ }
+
+ // Handle specific target shuffles.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP: {
+ SDValue Src = N.getOperand(0);
+ // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
+ if (VT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
+ SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
+ DCI.CombineTo(N.getNode(), Movddup);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
@@ -34580,7 +35985,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
- SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
@@ -34627,6 +36033,190 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return N; // Return N so it doesn't get rechecked!
}
+ // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
+ // i16. So shrink it ourselves if we can make a broadcast_load.
+ if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
+ assert(Subtarget.hasAVX2() && "Expected AVX2");
+ SDValue TruncIn = Src.getOperand(0);
+
+ // If this is a truncate of a non extending load we can just narrow it to
+ // use a broadcast_load.
+ if (ISD::isNormalLoad(TruncIn.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
+ // Unless it's volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of an i16 extload, we can directly replace it.
+ if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
+ ISD::isEXTLoad(Src.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
+ if (LN->getMemoryVT().getSizeInBits() == 16) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of a load that has been shifted right, we can
+ // offset the pointer and use a narrower load.
+ if (TruncIn.getOpcode() == ISD::SRL &&
+ TruncIn.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
+ ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
+ unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
+ // Make sure the shift amount and the load size are divisible by 16.
+ // Don't do this if the load is volatile or atomic.
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
+ LN->isSimple()) {
+ unsigned Offset = ShiftAmt / 8;
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
+ SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo().getWithOffset(Offset),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // vbroadcast(vzload X) -> vbroadcast_load X
+ if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
+ MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
+ if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // vbroadcast(vector load X) -> vbroadcast_load
+ if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::VZEXT_MOVL: {
+ SDValue N0 = N.getOperand(0);
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
+ auto *LN = cast<LoadSDNode>(N0);
+ if (SDValue VZLoad =
+ narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
+ // and can just use a VZEXT_LOAD.
+ // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
+ if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *LN = cast<MemSDNode>(N0);
+ if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
+ // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
+ // if the upper bits of the i64 are zero.
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getValueType() == MVT::i64) {
+ SDValue In = N0.getOperand(0);
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(In, Mask)) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
+ MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
+ return DAG.getBitcast(VT, Movl);
+ }
+ }
+
+ // Load a scalar integer constant directly to XMM instead of transferring an
+ // immediate value from GPR.
+ // vzext_movl (scalar_to_vector C) --> load [C,0...]
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ // Create a vector constant - scalar constant followed by zeros.
+ EVT ScalarVT = N0.getOperand(0).getValueType();
+ Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
+ unsigned NumElts = VT.getVectorNumElements();
+ Constant *Zero = ConstantInt::getNullValue(ScalarTy);
+ SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
+ ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
+
+ // Load the vector constant from constant pool.
+ MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
+ MachineMemOperand::MOLoad);
+ }
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -34667,6 +36257,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::VPERM2X128: {
+ // If both 128-bit values were inserted into high halves of 256-bit values,
+ // the shuffle can be reduced to a concatenation of subvectors:
+ // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
+ // Note: We are only looking for the exact high/high shuffle mask because we
+ // expect to fold other similar patterns before creating this opcode.
+ SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
+ SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (!(Imm == 0x31 &&
+ Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Ins0.getValueType() == Ins1.getValueType()))
+ return SDValue();
+
+ SDValue X = Ins0.getOperand(1);
+ SDValue Y = Ins1.getOperand(1);
+ unsigned C1 = Ins0.getConstantOperandVal(2);
+ unsigned C2 = Ins1.getConstantOperandVal(2);
+ MVT SrcVT = X.getSimpleValueType();
+ unsigned SrcElts = SrcVT.getVectorNumElements();
+ if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
+ C1 != SrcElts || C2 != SrcElts)
+ return SDValue();
+
+ return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ Ins1.getValueType(), X, Y));
+ }
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -34706,8 +36324,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
- SDValue Op2 = N.getOperand(2);
- unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned InsertPSMask = N.getConstantOperandVal(2);
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
@@ -34847,9 +36464,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
- V.hasOneUse()) {
+ V.hasOneUse() && V.getOperand(0).hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
- if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ if (D.getOpcode() == X86ISD::PSHUFD) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
@@ -35248,7 +36865,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
// Attempt to combine into a vector load/broadcast.
- if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+ if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
+ Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
@@ -35281,79 +36899,100 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
- // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
- // in the upper 64 bits.
- // TODO: Can we generalize this using computeKnownBits.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
- (VT == MVT::v2f64 || VT == MVT::v2i64) &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
- N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
- SDValue In = N->getOperand(0).getOperand(0);
- switch (In.getOpcode()) {
- default:
- break;
- case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
- case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
- case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
- case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
- case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
- case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
- case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
- if (In.getOperand(0).getValueType() == MVT::v2f64 ||
- In.getOperand(0).getValueType() == MVT::v2i64)
- return N->getOperand(0); // return the bitcast
- break;
- case X86ISD::STRICT_CVTTP2SI:
- case X86ISD::STRICT_CVTTP2UI:
- case X86ISD::STRICT_CVTSI2P:
- case X86ISD::STRICT_CVTUI2P:
- case X86ISD::STRICT_VFPROUND:
- if (In.getOperand(1).getValueType() == MVT::v2f64 ||
- In.getOperand(1).getValueType() == MVT::v2i64)
- return N->getOperand(0);
- break;
- }
- }
-
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
- N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
- N->getOperand(0).hasOneUse() &&
- N->getOperand(0).getOperand(0).isUndef() &&
- isNullConstant(N->getOperand(0).getOperand(2))) {
- SDValue In = N->getOperand(0).getOperand(1);
- SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
- getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
- Movl, N->getOperand(0).getOperand(2));
- }
-
- // If this a vzmovl of a full vector load, replace it with a vzload, unless
- // the load is volatile.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
- ISD::isNormalLoad(N->getOperand(0).getNode())) {
- LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- if (LN->isSimple()) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- VT.getVectorElementType(),
- LN->getPointerInfo(),
- LN->getAlignment(),
- MachineMemOperand::MOLoad);
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
- return VZLoad;
+ N->getOperand(0).hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
+ SDValue In = V.getOperand(1);
+ MVT SubVT =
+ MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
+ In.getValueSizeInBits() / VT.getScalarSizeInBits());
+ In = DAG.getBitcast(SubVT, In);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, V.getOperand(2));
}
}
return SDValue();
}
+// Simplify variable target shuffle masks based on the demanded elements.
+// TODO: Handle DemandedBits in mask indices as well?
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
+ SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
+ TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
+ // If we're demanding all elements don't bother trying to simplify the mask.
+ unsigned NumElts = DemandedElts.getBitWidth();
+ if (DemandedElts.isAllOnesValue())
+ return false;
+
+ SDValue Mask = Op.getOperand(MaskIndex);
+ if (!Mask.hasOneUse())
+ return false;
+
+ // Attempt to generically simplify the variable shuffle mask.
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+
+ // Attempt to extract+simplify a (constant pool load) shuffle mask.
+ // TODO: Support other types from getTargetShuffleMaskIndices?
+ SDValue BC = peekThroughOneUseBitcasts(Mask);
+ EVT BCVT = BC.getValueType();
+ auto *Load = dyn_cast<LoadSDNode>(BC);
+ if (!Load)
+ return false;
+
+ const Constant *C = getTargetConstantFromNode(Load);
+ if (!C)
+ return false;
+
+ Type *CTy = C->getType();
+ if (!CTy->isVectorTy() ||
+ CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
+ return false;
+
+ // Handle scaling for i64 elements on 32-bit targets.
+ unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
+ if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
+ return false;
+ unsigned Scale = NumCstElts / NumElts;
+
+ // Simplify mask if we have an undemanded element that is not undef.
+ bool Simplified = false;
+ SmallVector<Constant *, 32> ConstVecOps;
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
+ ConstVecOps.push_back(UndefValue::get(Elt->getType()));
+ Simplified = true;
+ continue;
+ }
+ ConstVecOps.push_back(Elt);
+ }
+ if (!Simplified)
+ return false;
+
+ // Generate new constant pool entry + legalize immediately for the load.
+ SDLoc DL(Op);
+ SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
+ SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
+ SDValue NewMask = TLO.DAG.getLoad(
+ BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
+ MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
+ Load->getAlign());
+ return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
+}
+
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
@@ -35523,12 +37162,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnesValue()) {
- APInt DemandedSrcBits =
- APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
- SDValue NewN0 = SimplifyMultipleUseDemandedBits(
- N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
- SDValue NewN1 = SimplifyMultipleUseDemandedBits(
- N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
+ TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
+ TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -35590,6 +37227,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownUndef = LHSUndef & RHSUndef;
break;
}
+ case X86ISD::VZEXT_MOVL: {
+ // If upper demanded elements are already zero then we have nothing to do.
+ SDValue Src = Op.getOperand(0);
+ APInt DemandedUpperElts = DemandedElts;
+ DemandedUpperElts.clearLowBits(1);
+ if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
+ return TLO.CombineTo(Op, Src);
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -35607,36 +37253,32 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
+ // Aggressively peek through src to get at the demanded elt.
+ // TODO - we should do this for all target/faux shuffles ops.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, SrcElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
break;
}
- case X86ISD::VPERMV: {
- SDValue Mask = Op.getOperand(0);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
+ Depth))
return true;
break;
- }
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
- case X86ISD::VPERMILPV: {
- SDValue Mask = Op.getOperand(1);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMILPV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
+ Depth))
return true;
break;
- }
case X86ISD::VPPERM:
- case X86ISD::VPERMIL2: {
- SDValue Mask = Op.getOperand(2);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMIL2:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
+ Depth))
return true;
break;
}
- }
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
@@ -35651,18 +37293,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
- // Zero upper elements.
- case X86ISD::VZEXT_MOVL: {
- SDLoc DL(Op);
- SDValue Ext0 =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue ExtOp =
- TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
- SDValue UndefVec = TLO.DAG.getUNDEF(VT);
- SDValue Insert =
- insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
- return TLO.CombineTo(Op, Insert);
- }
// Subvector broadcast.
case X86ISD::SUBV_BROADCAST: {
SDLoc DL(Op);
@@ -35715,10 +37345,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
break;
}
- // Target Shuffles.
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL:
+ // Target unary shuffles by immediate:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ case X86ISD::VPERMILPI:
+ // (Non-Lane Crossing) Target Shuffles.
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERMIL2:
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
+ case X86ISD::BLENDI:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
@@ -35728,14 +37368,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ EVT SrcVT = SrcOp.getValueType();
+ assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
+ "Unsupported vector size");
+ Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
+ ExtSizeInBits)
+ : SrcOp);
+ }
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
- SDValue Ext0 =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue Ext1 =
- extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
@@ -35832,6 +37478,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
+ case X86ISD::VTRUNC: {
+ KnownBits KnownOp;
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // Simplify the input, using demanded bit information.
+ APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
+ APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+ return true;
+ break;
+ }
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
@@ -35888,6 +37546,14 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
}
+ // If we are only demanding sign bits then we can use the shift source directly.
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
+ unsigned UpperDemandedBits =
+ BitWidth - OriginalDemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return TLO.CombineTo(Op, Op0);
+
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
@@ -36001,7 +37667,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
- Known = KnownVec.zext(BitWidth, true);
+ Known = KnownVec.zext(BitWidth);
return false;
}
break;
@@ -36054,6 +37720,17 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
+ SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
+ }
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
@@ -36086,16 +37763,51 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
- if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
- KnownSrc, TLO, Depth + 1))
+ APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+ Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
+
+ // Attempt to avoid a multi-use op if we don't need anything from it.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Only the bottom 16 bits of the control are required.
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ const APInt &Val1 = Cst1->getAPIntValue();
+ APInt MaskedVal1 = Val1 & 0xFFFF;
+ if (MaskedVal1 != Val1) {
+ SDLoc DL(Op);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
+ TLO.DAG.getConstant(MaskedVal1, DL, VT)));
+ }
+ }
+
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ break;
+ }
}
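For context on the X86ISD::BEXTR case above: the combine leans on the control operand layout, in which bits [7:0] give the start bit, bits [15:8] give the extraction length, and anything above bit 15 is ignored, which is why the constant is masked to 16 bits and a known-zero length folds to zero. A scalar model of that behaviour, illustrative only:

#include <cstdint>
static uint64_t bextr64(uint64_t Src, uint32_t Ctrl) {
  unsigned Start = Ctrl & 0xFF;      // bits [7:0]: starting bit index
  unsigned Len = (Ctrl >> 8) & 0xFF; // bits [15:8]: number of bits extracted
  if (Len == 0 || Start >= 64)
    return 0;                        // zero length (or start past the MSB) gives 0
  uint64_t V = Src >> Start;
  return Len >= 64 ? V : (V & ((1ULL << Len) - 1));
}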
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -36119,8 +37831,26 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
!DemandedElts[CIdx->getZExtValue()])
return Vec;
- break;
+ break;
}
+ case X86ISD::VSHLI: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ unsigned BitWidth = DemandedBits.getBitWidth();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return Op0;
+ break;
+ }
+ case X86ISD::VSRAI:
+ // iff we only need the sign bit then we can use the source directly.
+ // TODO: generalize where we only demand extended signbits.
+ if (DemandedBits.isSignMask())
+ return Op.getOperand(0);
+ break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
@@ -36154,13 +37884,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
int M = ShuffleMask[i];
if (!DemandedElts[i] || ShuffleUndef[i])
continue;
- int Op = M / NumElts;
- int Index = M % NumElts;
- if (M < 0 || Index != i) {
+ int OpIdx = M / NumElts;
+ int EltIdx = M % NumElts;
+ if (M < 0 || EltIdx != i) {
IdentityOp.clearAllBits();
break;
}
- IdentityOp &= APInt::getOneBitSet(NumOps, Op);
+ IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
if (IdentityOp == 0)
break;
}
@@ -36191,6 +37921,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
return false;
}
+// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
+static unsigned getAltBitOpcode(unsigned Opcode) {
+ switch(Opcode) {
+ case ISD::AND: return X86ISD::FAND;
+ case ISD::OR: return X86ISD::FOR;
+ case ISD::XOR: return X86ISD::FXOR;
+ case X86ISD::ANDNP: return X86ISD::FANDN;
+ }
+ llvm_unreachable("Unknown bitwise opcode");
+}
+
+// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
+static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
+ const SDLoc &DL) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT != MVT::v4i1)
+ return SDValue();
+
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
+ SDValue Op0 = Src.getOperand(0);
+ if (ISD::isNormalLoad(Op0.getNode()))
+ return DAG.getBitcast(MVT::v4f32, Op0);
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getValueType() == MVT::v4f32)
+ return Op0.getOperand(0);
+ }
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
+ SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
+ if (Op0 && Op1)
+ return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
+ Op1);
+ break;
+ }
+ }
+ return SDValue();
+}
+
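The SSE1-only helper above exists because, without SSE2, (bitcast (v4i1 (setlt v4i32 X, 0))) would otherwise be scalarized during type legalization; collecting the four sign bits with MOVMSKPS on an f32 view of the same data gives the identical result. Roughly the source-level idiom being preserved, as a sketch (not code from the patch):

#include <xmmintrin.h>
// SSE1: bit i of the result is the sign bit of 32-bit lane i.
static int signBits4(__m128 V) {
  return _mm_movemask_ps(V);
}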
// Helper to push sign extension of vXi1 SETCC result through bitops.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
@@ -36221,18 +37996,40 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
+ // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+ // legalization destroys the v4i32 type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
+ if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
+ DAG.getBitcast(MVT::v4f32, V));
+ return DAG.getZExtOrTrunc(V, DL, VT);
+ }
+ }
+
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
- bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
- (Src.getOperand(0).getValueType() == MVT::v16i8 ||
- Src.getOperand(0).getValueType() == MVT::v32i8 ||
- Src.getOperand(0).getValueType() == MVT::v64i8);
+ bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
+
+ // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
+ // directly with vpmovmskb/vmovmskps/vmovmskpd.
+ if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
+ EVT CmpVT = Src.getOperand(0).getValueType();
+ EVT EltVT = CmpVT.getVectorElementType();
+ if (CmpVT.getSizeInBits() <= 256 &&
+ (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
+ PreferMovMsk = true;
+ }
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
- if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
+ if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
@@ -36288,7 +38085,14 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
case MVT::v64i1:
// If we have AVX512F, but not AVX512BW and the input is truncated from
// v64i8 checked earlier. Then split the input and make two pmovmskbs.
- if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
+ if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasBWI())
+ return SDValue();
+ SExtVT = MVT::v64i8;
+ break;
+ }
+ // Split if this is a <64 x i8> comparison result.
+ if (checkBitcastSrcVectorSize(Src, 512)) {
SExtVT = MVT::v64i8;
break;
}
@@ -36458,6 +38262,74 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
return Ops[0];
}
+// Recursive function that attempts to find if a bool vector node was originally
+// a vector/float/double that got truncated/extended/bitcast to/from a scalar
+// integer. If so, replace the scalar ops with bool vector equivalents back down
+// the chain.
+static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned Opc = V.getOpcode();
+ switch (Opc) {
+ case ISD::BITCAST: {
+ // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() || SrcVT.isFloatingPoint())
+ return DAG.getBitcast(VT, Src);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ // If we find a suitable source, a truncated scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
+ DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ // If we find a suitable source, an extended scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Src.getScalarValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
+ : DAG.getConstant(0, DL, VT),
+ N0, DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::OR: {
+ // If we find suitable sources, we can just move an OR to the vector domain.
+ SDValue Src0 = V.getOperand(0);
+ SDValue Src1 = V.getOperand(1);
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
+ return DAG.getNode(Opc, DL, VT, N0, N1);
+ break;
+ }
+ case ISD::SHL: {
+ // If we find a suitable source, a SHL becomes a KSHIFTL.
+ SDValue Src0 = V.getOperand(0);
+ if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ return DAG.getNode(
+ X86ISD::KSHIFTL, DL, VT, N0,
+ DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
+ break;
+ }
+ }
+ return SDValue();
+}
+
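One way to read combineBitcastToBoolVector above: once the scalar integer is known to carry vXi1 mask bits, scalar truncation is taking the low i1 lanes (EXTRACT_SUBVECTOR), zero/any-extension is insertion into a zeroed or undef wider mask, OR stays an OR, and a left shift by a constant becomes KSHIFTL. A tiny bit-level model of the truncate step, purely illustrative:

#include <bitset>
// Viewing a 16-bit scalar as sixteen i1 lanes, "trunc i16 -> i8" keeps
// exactly the low eight lanes, i.e. an extract of the low subvector.
static std::bitset<8> truncMaskTo8(const std::bitset<16> &Mask) {
  return std::bitset<8>(Mask.to_ullong() & 0xFF);
}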
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -36476,24 +38348,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
- // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type
- // legalization destroys the v4i32 type.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
- VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
- N0.getOperand(0).getValueType() == MVT::v4i32 &&
- ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
- cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
- SDValue N00 = N0.getOperand(0);
- // Only do this if we can avoid scalarizing the input.
- if (ISD::isNormalLoad(N00.getNode()) ||
- (N00.getOpcode() == ISD::BITCAST &&
- N00.getOperand(0).getValueType() == MVT::v4f32)) {
- SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
- DAG.getBitcast(MVT::v4f32, N00));
- return DAG.getZExtOrTrunc(V, dl, VT);
- }
- }
-
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
@@ -36535,6 +38389,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
+ } else {
+ // If we're bitcasting from iX to vXi1, see if the integer originally
+ // began as a vXi1 and whether we can remove the bitcast entirely.
+ if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
+ SrcVT.isScalarInteger() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ if (SDValue V =
+ combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
+ return V;
+ }
}
// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
@@ -36549,19 +38413,30 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
DAG.getBitcast(MVT::i16, N0.getOperand(0)));
- // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
- // determines // the number of bits loaded. Remaining bits are zero.
+ // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
+ // and the vbroadcast_load are both integer or both fp. In some cases this
+ // will remove the bitcast entirely.
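+  // e.g. (v4f32 (bitcast (v4i32 (vbroadcast_load p)))) becomes
+  // (v4f32 (vbroadcast_load p)), with no bitcast left at all.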
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
- VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
- VT.getVectorElementType(),
- BCast->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
- return ResNode;
+ unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
+ unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
+      // Don't swap i8/i16 since we don't have fp types of that size.
+ if (MemSize >= 32) {
+ MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
+ : MVT::getIntegerVT(MemSize);
+ MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
+ : MVT::getIntegerVT(SrcVTSize);
+ LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
+
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemVT, BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return DAG.getBitcast(VT, ResNode);
+ }
}
// Since MMX types are special and don't usually play with other vector types,
@@ -36648,6 +38523,47 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N0), VT);
}
+ // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
+ // Turn it into a sign bit compare that produces a k-register. This avoids
+ // a trip through a GPR.
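+  // e.g. (v8i1 (bitcast (i8 (trunc (movmsk (v8i32 X)))))) becomes
+  // (v8i1 (setcc X, 0, setlt)), keeping the result in a k-register.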
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isPowerOf2_32(VT.getVectorNumElements())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue Src = N0;
+
+ // Peek through truncate.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ Src = N0.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
+ SDValue MovmskIn = Src.getOperand(0);
+ MVT MovmskVT = MovmskIn.getSimpleValueType();
+ unsigned MovMskElts = MovmskVT.getVectorNumElements();
+
+ // We allow extra bits of the movmsk to be used since they are known zero.
+ // We can't convert a VPMOVMSKB without avx512bw.
+ if (MovMskElts <= NumElts &&
+ (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
+ EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
+ MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
+ SDLoc dl(N);
+ MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
+ SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
+ DAG.getConstant(0, dl, IntVT), ISD::SETLT);
+ if (EVT(CmpVT) == VT)
+ return Cmp;
+
+ // Pad with zeroes up to original VT to replace the zeroes that were
+ // being used from the MOVMSK.
+ unsigned NumConcats = NumElts / MovMskElts;
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
+ Ops[0] = Cmp;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+ }
+ }
+ }
+
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
@@ -36772,12 +38688,9 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
- unsigned NumElts = SrcVT.getVectorNumElements();
- unsigned NumSubElts = NumElts / 2;
- SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
- unsigned SubSizeInBits = SrcVT.getSizeInBits();
- SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
- SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
+ SrcVT = Lo.getValueType();
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
@@ -36864,6 +38777,25 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
+ // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
+ // PCMPEQQ (SSE41+), use PCMPEQD instead.
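+      // e.g. all_of (v2i64 seteq X, 0) is equivalent to
+      // all_of (v4i32 seteq (bitcast X), 0), since an i64 element is zero
+      // iff both of its i32 halves are zero.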
+ if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
+ Match.getOpcode() == ISD::SETCC &&
+ ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
+ ISD::CondCode::SETEQ) {
+ SDValue Vec = Match.getOperand(0);
+ if (Vec.getValueType().getScalarType() == MVT::i64 &&
+ (2 * NumElts) <= MaxElts) {
+ NumElts *= 2;
+ EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Match = DAG.getSetCC(
+ DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
+ DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
+ }
+ }
+
// Use combineBitcastvxi1 to create the MOVMSK.
while (NumElts > MaxElts) {
SDValue Lo, Hi;
@@ -36878,10 +38810,7 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
- // Bail with AVX512VL (which uses predicate registers).
- if (Subtarget.hasVLX())
- return SDValue();
-
+ // FIXME: Better handling of k-registers or 512-bit vectors?
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
@@ -36958,21 +38887,14 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is any integer type above i16.
- EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
+ EVT ExtractVT = Extract->getValueType(0);
+ // Verify the type we're extracting is either i32 or i64.
+ // FIXME: Could support other types, but this is what we have coverage for.
+ if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
return SDValue();
- unsigned RegSize = 128;
- if (Subtarget.useBWIRegs())
- RegSize = 512;
- else if (Subtarget.hasAVX())
- RegSize = 256;
-
- // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
- // TODO: We should be able to handle larger vectors by splitting them before
- // feeding them into several SADs, and then reducing over those.
- if (RegSize / VT.getVectorNumElements() < 8)
+ EVT VT = Extract->getOperand(0).getValueType();
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Match shuffle + add pyramid.
@@ -36988,8 +38910,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// (extends the sign bit which is zero).
// So it is correct to skip the sign/zero extend instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
- Root.getOpcode() == ISD::ZERO_EXTEND ||
- Root.getOpcode() == ISD::ANY_EXTEND))
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be a select that is the root of an
@@ -37009,7 +38931,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
unsigned Stages = Log2_32(VT.getVectorNumElements());
- MVT SadVT = SAD.getSimpleValueType();
+ EVT SadVT = SAD.getValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
@@ -37024,12 +38946,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- MVT Type = Extract->getSimpleValueType(0);
- unsigned TypeSizeInBits = Type.getSizeInBits();
- // Return the lowest TypeSizeInBits bits.
- MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
+ unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
+ // Return the lowest ExtractSizeInBits bits.
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
+ SadVT.getSizeInBits() / ExtractSizeInBits);
SAD = DAG.getBitcast(ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
Extract->getOperand(1));
}
@@ -37048,19 +38970,34 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned SrcEltBits = SrcSVT.getSizeInBits();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ const APInt &IdxC = N->getConstantOperandAPInt(1);
+ if (IdxC.uge(NumSrcElts))
+ return SDValue();
+
SDValue SrcBC = peekThroughBitcasts(Src);
- // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
+ // Handle extract(bitcast(broadcast(scalar_value))).
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
- if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, SrcOp);
+ EVT SrcOpVT = SrcOp.getValueType();
+ if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
+ (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
+ unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
+ unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
+ // TODO support non-zero offsets.
+ if (Offset == 0) {
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
+ return SrcOp;
+ }
+ }
}
// If we're extracting a single element from a broadcast load and there are
@@ -37069,22 +39006,43 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
- VT.getSizeInBits() == SrcBCWidth) {
+ VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getPointerInfo(),
- MemIntr->getAlignment(),
+ MemIntr->getOriginalAlign(),
MemIntr->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Load;
}
}
+ // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
+ // TODO: Move to DAGCombine?
+ if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
+ SrcBC.getValueType().isInteger() &&
+ (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
+ SrcBC.getScalarValueSizeInBits() ==
+ SrcBC.getOperand(0).getValueSizeInBits()) {
+ unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
+ if (IdxC.ult(Scale)) {
+ unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
+ SDValue Scl = SrcBC.getOperand(0);
+ EVT SclVT = Scl.getValueType();
+ if (Offset) {
+ Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
+ DAG.getShiftAmountConstant(Offset, SclVT, dl));
+ }
+ Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
+ Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
+ return Scl;
+ }
+ }
+
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
- if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
- isNullConstant(Idx)) {
+ if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
Src = DAG.getBitcast(SrcVT, Src);
return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
@@ -37096,12 +39054,18 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
+ return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return SDValue();
+
// Attempt to narrow/widen the shuffle mask to the correct size.
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
- scaleShuffleMask<int>(Scale, Mask, ScaledMask);
+ narrowShuffleMaskElts(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
@@ -37126,7 +39090,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (Mask.size() != NumSrcElts)
return SDValue();
- int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ int SrcIdx = Mask[IdxC.getZExtValue()];
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
@@ -37153,8 +39117,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
- assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
- "Unexpected extraction type");
+ assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
@@ -37324,12 +39287,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
// vXi8 reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
- unsigned HalfSize = VecVT.getSizeInBits() / 2;
- unsigned HalfElts = VecVT.getVectorNumElements() / 2;
- SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
- SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
- Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
- VecVT = Rdx.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ VecVT = Lo.getValueType();
+ Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
}
assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
@@ -37344,8 +39305,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
}
// Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
- if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ if (!shouldUseHorizontalOp(true, DAG, Subtarget))
return SDValue();
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
@@ -37477,11 +39437,21 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
+ //
+ // Note that we only combine extracts on the *same* result number, i.e.
+ // t0 = merge_values a0, a1, a2, a3
+ // i1 = extract_vector_elt t0, Constant:i64<2>
+ // i1 = extract_vector_elt t0, Constant:i64<3>
+ // but not
+ // i1 = extract_vector_elt t0:1, Constant:i64<2>
+ // since the latter would need its own MOVMSK.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
- auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ unsigned ResNo = InputVector.getResNo();
+ auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
@@ -37530,8 +39500,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- // Check if the first operand is all zeros and Cond type is vXi1.
- // This situation only applies to avx512.
// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
// TODO: Can we assert that both operands are not zeros (because that should
// get simplified at node creation time)?
@@ -37546,14 +39514,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, DL, VT);
}
- if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
- Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
@@ -37778,12 +39738,13 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
return true;
};
+ APInt DemandedBits(APInt::getSignMask(BitWidth));
+
if (OnlyUsedAsSelectCond(Cond)) {
- APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
@@ -37805,15 +39766,9 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
}
// Otherwise we can still at least try to simplify multiple use bits.
- APInt DemandedMask(APInt::getSignMask(BitWidth));
- APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
- DemandedElts, DAG, 0))
- return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- V, N->getOperand(1), N->getOperand(2));
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
+ N->getOperand(1), N->getOperand(2));
return SDValue();
}
@@ -38297,6 +40252,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // If this an avx512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
+ Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
+ }
+
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
@@ -38316,12 +40284,86 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
- // Custom action for SELECT MMX
- if (VT == MVT::x86mmx) {
- LHS = DAG.getBitcast(MVT::i64, LHS);
- RHS = DAG.getBitcast(MVT::i64, RHS);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
- return DAG.getBitcast(VT, newSelect);
+ // Try to optimize vXi1 selects if both operands are either all constants or
+ // bitcasts from scalar integer type. In that case we can convert the operands
+ // to integer and use an integer select which will be converted to a CMOV.
+ // We need to take a little bit of care to avoid creating an i64 type after
+ // type legalization.
+ if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
+ (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
+ bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
+ bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
+
+ if ((LHSIsConst ||
+ (LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == IntVT)) &&
+ (RHSIsConst ||
+ (RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == IntVT))) {
+ if (LHSIsConst)
+ LHS = combinevXi1ConstantToInteger(LHS, DAG);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHSIsConst)
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
+ }
+
+ // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
+ // single bits, then invert the predicate and swap the select operands.
+ // This can lower using a vector shift bit-hack rather than mask and compare.
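+  // e.g. for v4i32 with mask C = <1,2,4,8>, (X & C) != 0 can be computed
+  // as (shl X, <31,30,29,28>) < 0, moving each mask bit into the sign bit.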
+ if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
+ Cond.getOperand(0).getOpcode() == ISD::AND &&
+ isNullOrNullSplat(Cond.getOperand(1)) &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ Cond.getOperand(0).getValueType() == VT) {
+ // The 'and' mask must be composed of power-of-2 constants.
+ SDValue And = Cond.getOperand(0);
+ auto *C = isConstOrConstSplat(And.getOperand(1));
+ if (C && C->getAPIntValue().isPowerOf2()) {
+ // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
+ SDValue NotCond =
+ DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
+ return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
+ }
+
+ // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
+ // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
+ // 16-bit lacks a proper blendv.
+ unsigned EltBitWidth = VT.getScalarSizeInBits();
+ bool CanShiftBlend =
+ TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+ (Subtarget.hasAVX2() && EltBitWidth == 64) ||
+ (Subtarget.hasXOP()));
+ if (CanShiftBlend &&
+ ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
+ return C->getAPIntValue().isPowerOf2();
+ })) {
+ // Create a left-shift constant to get the mask bits over to the sign-bit.
+ SDValue Mask = And.getOperand(1);
+ SmallVector<int, 32> ShlVals;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
+ ShlVals.push_back(EltBitWidth - 1 -
+ MaskVal->getAPIntValue().exactLogBase2());
+ }
+ // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
+ SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
+ SDValue NewCond =
+ DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
+ return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
+ }
}
return SDValue();
@@ -38647,6 +40689,282 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
return SDValue();
}
+/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
+/// to avoid the inversion.
+static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
+ if (EFLAGS.getOpcode() != X86ISD::PTEST &&
+ EFLAGS.getOpcode() != X86ISD::TESTP)
+ return SDValue();
+
+ // PTEST/TESTP sets EFLAGS as:
+ // TESTZ: ZF = (Op0 & Op1) == 0
+ // TESTC: CF = (~Op0 & Op1) == 0
+ // TESTNZC: ZF == 0 && CF == 0
+ EVT VT = EFLAGS.getValueType();
+ SDValue Op0 = EFLAGS.getOperand(0);
+ SDValue Op1 = EFLAGS.getOperand(1);
+ EVT OpVT = Op0.getValueType();
+
+ // TEST*(~X,Y) == TEST*(X,Y)
+ if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+ X86::CondCode InvCC;
+ switch (CC) {
+ case X86::COND_B:
+ // testc -> testz.
+ InvCC = X86::COND_E;
+ break;
+ case X86::COND_AE:
+ // !testc -> !testz.
+ InvCC = X86::COND_NE;
+ break;
+ case X86::COND_E:
+ // testz -> testc.
+ InvCC = X86::COND_B;
+ break;
+ case X86::COND_NE:
+ // !testz -> !testc.
+ InvCC = X86::COND_AE;
+ break;
+ case X86::COND_A:
+ case X86::COND_BE:
+ // testnzc -> testnzc (no change).
+ InvCC = CC;
+ break;
+ default:
+ InvCC = X86::COND_INVALID;
+ break;
+ }
+
+ if (InvCC != X86::COND_INVALID) {
+ CC = InvCC;
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp0), Op1);
+ }
+ }
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ // TESTZ(X,~Y) == TESTC(Y,X)
+ if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp1), Op0);
+ }
+
+ if (Op0 == Op1) {
+ SDValue BC = peekThroughBitcasts(Op0);
+ EVT BCVT = BC.getValueType();
+ assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+ "Unexpected vector type");
+
+ // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
+ if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
+ if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // If every element is an all-sign value, see if we can use MOVMSK to
+ // more efficiently extract the sign bits and compare that.
+ // TODO: Handle TESTC with comparison inversion.
+ // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
+ // MOVMSK combines to make sure its never worse than PTEST?
+ unsigned EltBits = BCVT.getScalarSizeInBits();
+ if (DAG.ComputeNumSignBits(BC) == EltBits) {
+ assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+ APInt SignMask = APInt::getSignMask(EltBits);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Res =
+ TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
+          // For vXi16 cases we need to use pmovmskb and extract every other
+ // sign bit.
+ SDLoc DL(EFLAGS);
+ if (EltBits == 16) {
+ MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
+ Res = DAG.getBitcast(MovmskVT, Res);
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ } else {
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+ }
+ }
+
+ // TESTZ(-1,X) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
+
+ // TESTZ(X,-1) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+ }
+
+ return SDValue();
+}
+
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Handle eq/ne against zero (any_of).
+ // Handle eq/ne against -1 (all_of).
+ if (!(CC == X86::COND_E || CC == X86::COND_NE))
+ return SDValue();
+ if (EFLAGS.getValueType() != MVT::i32)
+ return SDValue();
+ unsigned CmpOpcode = EFLAGS.getOpcode();
+ if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
+ return SDValue();
+ auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
+ if (!CmpConstant)
+ return SDValue();
+ const APInt &CmpVal = CmpConstant->getAPIntValue();
+
+ SDValue CmpOp = EFLAGS.getOperand(0);
+ unsigned CmpBits = CmpOp.getValueSizeInBits();
+ assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
+
+ // Peek through any truncate.
+ if (CmpOp.getOpcode() == ISD::TRUNCATE)
+ CmpOp = CmpOp.getOperand(0);
+
+ // Bail if we don't find a MOVMSK.
+ if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+ return SDValue();
+
+ SDValue Vec = CmpOp.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+ "Unexpected MOVMSK operand");
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned NumEltBits = VecVT.getScalarSizeInBits();
+
+ bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
+ bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
+ CmpVal.isMask(NumElts);
+ if (!IsAnyOf && !IsAllOf)
+ return SDValue();
+
+ // See if we can peek through to a vector with a wider element type, if the
+ // signbits extend down to all the sub-elements as well.
+ // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
+ // potential SimplifyDemandedBits/Elts cases.
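+  // e.g. if X is (v4i32 pcmpgt A, B), then for
+  // (movmsk (v16i8 (bitcast X))) == 0 we can use (movmsk X) == 0 instead.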
+ if (Vec.getOpcode() == ISD::BITCAST) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ MVT BCVT = BC.getSimpleValueType();
+ unsigned BCNumElts = BCVT.getVectorNumElements();
+ unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
+ if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
+ BCNumEltBits > NumEltBits &&
+ DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
+ SDLoc DL(EFLAGS);
+ unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
+ // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
+ if (IsAllOf && Subtarget.hasSSE41()) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ if (BC.getOpcode() == X86ISD::PCMPEQ &&
+ ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
+ MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ }
+
+ // See if we can avoid a PACKSS by calling MOVMSK on the sources.
+ // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
+ // sign bits prior to the comparison with zero unless we know that
+ // the vXi16 splats the sign bit down to the lower i8 half.
+ // TODO: Handle all_of patterns.
+ if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
+ SDValue VecOp0 = Vec.getOperand(0);
+ SDValue VecOp1 = Vec.getOperand(1);
+ bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
+ bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
+    // PMOVMSKB(PACKSSWB(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
+ if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
+ if (!SignExt0) {
+ Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
+ DAG.getConstant(0xAAAA, DL, MVT::i16));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(0, DL, MVT::i16));
+ }
+    // PMOVMSKB(PACKSSWB(LO(X), HI(X)))
+ // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
+ if (CmpBits == 16 && Subtarget.hasInt256() &&
+ VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
+ VecOp0.getConstantOperandAPInt(1) == 0 &&
+ VecOp1.getConstantOperandAPInt(1) == 8 &&
+ (IsAnyOf || (SignExt0 && SignExt1))) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
+ if (!SignExt0 || !SignExt1) {
+ assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ if (NumElts == CmpBits &&
+ getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
+ ShuffleMask, DAG) &&
+ ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
+ ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
+ unsigned NumShuffleElts = ShuffleMask.size();
+ APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
+ for (int M : ShuffleMask) {
+ assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
+ DemandedElts.setBit(M);
+ }
+ if (DemandedElts.isAllOnesValue()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result =
+ DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ EFLAGS.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
@@ -38659,6 +40977,13 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
+
+ if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
@@ -38680,7 +41005,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
- if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+ if (!(FalseOp.getValueType() == MVT::f80 ||
+ (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
+ (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
+ !Subtarget.hasCMov() || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
@@ -38989,7 +41317,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
: ISD::SIGN_EXTEND,
DL, VT, MulLo);
- MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi =
@@ -39120,10 +41448,14 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
- // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
- // Also allow v2i32 if it will be widened.
+ // Make sure the type is legal or will be widened to a legal type.
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+
+ // Without BWI, we would need to split v32i16.
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -39340,6 +41672,64 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
return NewMul;
}
+// Try to form a MULHU or MULHS node by looking for
+// (srl (mul ext, ext), 16)
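+// e.g. (v8i32 (srl (mul (zext v8i16:x), (zext v8i16:y)), 16))
+//        -> (v8i32 (zext (v8i16 (mulhu x, y))))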
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
+static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+ SDLoc DL(N);
+
+ // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
+ // the multiply.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ // The operation feeding into the shift must be a multiply.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
+ return SDValue();
+
+ // Input type should be at least vXi32.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = ShiftOperand.getOperand(0);
+ SDValue RHS = ShiftOperand.getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ EVT MulVT = LHS.getValueType();
+ if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
+
+ ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, DL, VT, Mulh);
+}
+
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -39399,12 +41789,16 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
// fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
// into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
// into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
@@ -39453,11 +41847,15 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
@@ -39501,16 +41899,92 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected pack opcode");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
+
+ // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+ // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
+ // truncation trees that help us avoid lane crossing shuffles.
+ // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getConstantOperandAPInt(1) == 0 &&
+ N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
+ N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
+ N0.getOperand(0).getValueType().is256BitVector()) {
+ // TODO - support target/faux shuffles.
+ SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ // To keep the PACK LHS/RHS coherency, we must be able to scale the unary
+ // shuffle to a vXi64 width - we can probably relax this in the future.
+ SmallVector<int, 4> ShuffleMask;
+ if (SVN->getOperand(1).isUndef() &&
+ scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
+ Lo = DAG.getBitcast(N0.getValueType(), Lo);
+ Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
+ Res = DAG.getBitcast(MVT::v4i32, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+
+ // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
+ // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
+ if (VT.is256BitVector()) {
+ if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
+ if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
+ SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
+ if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
+ scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
+ SDValue Op00 = SVN0->getOperand(0);
+ SDValue Op01 = SVN0->getOperand(1);
+ SDValue Op10 = SVN1->getOperand(0);
+ SDValue Op11 = SVN1->getOperand(1);
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ShuffleMask1);
+ }
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ SmallVector<int, 4> ShuffleMask;
+ ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
+ ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
+ SDLoc DL(N);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
+ Res = DAG.getBitcast(MVT::v4i64, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
- "Unexpected shift opcode");
+ "Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
@@ -39527,7 +42001,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumDstElts = VT.getVectorNumElements();
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
@@ -39574,6 +42047,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
+ // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
+ if (SDValue V = combineVectorPackWithShuffle(N, DAG))
+ return V;
+
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
@@ -39656,26 +42133,37 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
- else
- ShiftVal = NumBitsPerElt - 1;
+ ShiftVal = NumBitsPerElt - 1;
}
- // Shift N0 by zero -> N0.
+ // (shift X, 0) -> X
if (!ShiftVal)
return N0;
- // Shift zero -> zero.
+ // (shift 0, C) -> 0
if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ // N0 is all zeros or undef. We guarantee that the bits shifted into the
+ // result are all zeros, not undef.
return DAG.getConstant(0, SDLoc(N), VT);
- // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
- // clamped to (NumBitsPerElt - 1).
- if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
+ // (VSRAI -1, C) -> -1
+ if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
+ // N0 is all ones or undef. We guarantee that the bits shifted into the
+ // result are all ones, not undef.
+ return DAG.getConstant(-1, SDLoc(N), VT);
+
+ // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
+ if (Opcode == N0.getOpcode()) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
- if (NewShiftVal >= NumBitsPerElt)
+ if (NewShiftVal >= NumBitsPerElt) {
+ // Out of range logical bit shifts are guaranteed to be zero.
+ // Out of range arithmetic bit shifts splat the sign bit.
+ if (LogicalShift)
+ return DAG.getConstant(0, SDLoc(N), VT);
NewShiftVal = NumBitsPerElt - 1;
- return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
+ }
+ return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
@@ -39693,14 +42181,22 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
- for (APInt &Elt : EltBits) {
- if (X86ISD::VSHLI == Opcode)
+ // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
+      // created an undef input due to no input bits being demanded, but the
+      // user still expects 0 in the other bits.
+ for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
+ APInt &Elt = EltBits[i];
+ if (UndefElts[i])
+ Elt = 0;
+ else if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftVal;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftVal);
else
Elt.lshrInPlace(ShiftVal);
}
+ // Reset undef elements since they were zeroed above.
+ UndefElts = 0;
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}
@@ -39717,19 +42213,24 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
+ N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
"Unexpected vector insertion");
- unsigned NumBitsPerElt = VT.getScalarSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBitsPerElt), DCI))
- return SDValue(N, 0);
+ if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
+ }
- // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
- SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
- return Res;
+ // Attempt to combine insertion patterns to a shuffle.
+ if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
return SDValue();
}
@@ -39752,7 +42253,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
- if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
@@ -39851,10 +42352,27 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (SDValue Not = IsNOT(N0, DAG)) {
+ auto GetNot = [&VT, &DAG](SDValue V) {
+ // Basic X = NOT(Y) detection.
+ if (SDValue Not = IsNOT(V, DAG))
+ return Not;
+ // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
+ if (V.getOpcode() == X86ISD::VBROADCAST) {
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector())
+ return SDValue();
+ if (SDValue Not = IsNOT(Src, DAG))
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
+ DAG.getBitcast(SrcVT, Not));
+ }
+ return SDValue();
+ };
+
+ if (SDValue Not = GetNot(N0)) {
X = Not;
Y = N1;
- } else if (SDValue Not = IsNOT(N1, DAG)) {
+ } else if (SDValue Not = GetNot(N1)) {
X = Not;
Y = N0;
} else
@@ -39865,6 +42383,65 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+//   or (and (truncate x), (truncate y)),
+//      (xor (truncate z), (build_vector (constants)))
+// Given a target type \p VT, we generate
+//   or (and x, y), (xor z, zext(build_vector (constants)))
+// where x, y and z are of type \p VT. We can do so if each operand is either
+// a truncate from VT, a vector of constants, or can itself be recursively
+// promoted.
+static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
+ unsigned Depth) {
+ // Limit recursion to avoid excessive compile times.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
+ N->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
+ return SDValue();
+
+ if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
+ N0 = NN0;
+ else {
+ // The Left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ if (N0.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ N0 = N0.getOperand(0);
+ }
+
+ if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+ N1 = NN1;
+ else {
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getValueType() == VT;
+ if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
+ return SDValue();
+
+ if (RHSTrunc)
+ N1 = N1.getOperand(0);
+ else
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+ }
+
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+}
+
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
@@ -39876,6 +42453,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
+ SDLoc DL(N);
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
@@ -39883,57 +42461,33 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
- if (Narrow->getOpcode() != ISD::XOR &&
- Narrow->getOpcode() != ISD::AND &&
- Narrow->getOpcode() != ISD::OR)
- return SDValue();
-
- SDValue N0 = Narrow->getOperand(0);
- SDValue N1 = Narrow->getOperand(1);
- SDLoc DL(Narrow);
-
- // The Left side has to be a trunc.
- if (N0.getOpcode() != ISD::TRUNCATE)
- return SDValue();
-
- // The type of the truncated inputs.
- if (N0.getOperand(0).getValueType() != VT)
- return SDValue();
-
- // The right side has to be a 'trunc' or a constant vector.
- bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
- N1.getOperand(0).getValueType() == VT;
- if (!RHSTrunc &&
- !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
- return SDValue();
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
- return SDValue();
-
- // Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0.getOperand(0);
- if (RHSTrunc)
- N1 = N1.getOperand(0);
- else
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
-
// Generate the wide operation.
- SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
- unsigned Opcode = N->getOpcode();
- switch (Opcode) {
+ SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+ if (!Op)
+ return SDValue();
+ switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
- return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
+ return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
}
}
+static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
+ unsigned FPOpcode;
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ }
+ return FPOpcode;
+}
+
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
@@ -39958,18 +42512,45 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
- unsigned FPOpcode;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected input node for FP logic conversion");
- case ISD::AND: FPOpcode = X86ISD::FAND; break;
- case ISD::OR: FPOpcode = X86ISD::FOR; break;
- case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
- }
-
+ unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
+// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
+// to reduce XMM->GPR traffic.
+static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
+ "Unexpected bit opcode");
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Both operands must be single use MOVMSK.
+ if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
+ N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue Vec0 = N0.getOperand(0);
+ SDValue Vec1 = N1.getOperand(0);
+ EVT VecVT0 = Vec0.getValueType();
+ EVT VecVT1 = Vec1.getValueType();
+
+ // Both MOVMSK operands must be from vectors of the same size and same element
+  // size, but it's OK for an fp/int difference.
+ if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
+ VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned VecOpc =
+ VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
+ SDValue Result =
+ DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+}
+
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
@@ -40292,7 +42873,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40302,9 +42884,11 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
- APInt AllBits = APInt::getAllOnesValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
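+ // e.g. an all-of test over the first K lanes becomes:
+ //   (Mask & ((1 << K) - 1)) == ((1 << K) - 1)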
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
}
}
}
@@ -40312,6 +42896,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
return V;
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -40420,6 +43007,16 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
}
SDLoc DL(N);
+
+ if (UseVPTERNLOG) {
+ // Emit a VPTERNLOG node directly.
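+ // 0xCA is the ternlog immediate for A ? B : C, i.e. (A & B) | (~A & C).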
+ SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
+ SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
+ return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ }
+
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
@@ -40503,6 +43100,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasSSE41())
return SDValue();
+ // If we have VPTERNLOG, we should prefer that since PBLENDVB is multiple uops.
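+ // (VPTERNLOG on 128/256-bit vectors requires the AVX512VL subset, hence the
+ // VLX check.)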
+ if (Subtarget.hasVLX())
+ return SDValue();
+
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
@@ -40619,139 +43220,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
-static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- EVT VT = N->getValueType(0);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
- !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
- return SDValue();
-
- // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.shouldOptForSize();
- unsigned Bits = VT.getScalarSizeInBits();
-
- // SHLD/SHRD instructions have lower register pressure, but on some
- // platforms they have higher latency than the equivalent
- // series of shifts/or that would otherwise be generated.
- // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
- // have higher latencies and we are not optimizing for size.
- if (!OptForSize && Subtarget.isSHLDSlow())
- return SDValue();
-
- if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
- std::swap(N0, N1);
- if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
- return SDValue();
- if (!N0.hasOneUse() || !N1.hasOneUse())
- return SDValue();
-
- EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
- SDValue ShAmt0 = N0.getOperand(1);
- if (ShAmt0.getValueType() != ShiftVT)
- return SDValue();
- SDValue ShAmt1 = N1.getOperand(1);
- if (ShAmt1.getValueType() != ShiftVT)
- return SDValue();
-
- // Peek through any modulo shift masks.
- SDValue ShMsk0;
- if (ShAmt0.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
- ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk0 = ShAmt0;
- ShAmt0 = ShAmt0.getOperand(0);
- }
- SDValue ShMsk1;
- if (ShAmt1.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
- ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk1 = ShAmt1;
- ShAmt1 = ShAmt1.getOperand(0);
- }
-
- if (ShAmt0.getOpcode() == ISD::TRUNCATE)
- ShAmt0 = ShAmt0.getOperand(0);
- if (ShAmt1.getOpcode() == ISD::TRUNCATE)
- ShAmt1 = ShAmt1.getOperand(0);
-
- SDLoc DL(N);
- unsigned Opc = ISD::FSHL;
- SDValue Op0 = N0.getOperand(0);
- SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
- Opc = ISD::FSHR;
- std::swap(Op0, Op1);
- std::swap(ShAmt0, ShAmt1);
- std::swap(ShMsk0, ShMsk1);
- }
-
- auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
- SDValue Amt) {
- if (Opc == ISD::FSHR)
- std::swap(Op0, Op1);
- return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
- };
-
- // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
- // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
- // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
- if (ShAmt1.getOpcode() == ISD::SUB) {
- SDValue Sum = ShAmt1.getOperand(0);
- if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
- SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
- if (ShAmt1Op1.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
- ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk1 = ShAmt1Op1;
- ShAmt1Op1 = ShAmt1Op1.getOperand(0);
- }
- if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
- ShAmt1Op1 = ShAmt1Op1.getOperand(0);
- if ((SumC->getAPIntValue() == Bits ||
- (SumC->getAPIntValue() == 0 && ShMsk1)) &&
- ShAmt1Op1 == ShAmt0)
- return GetFunnelShift(Op0, Op1, ShAmt0);
- }
- } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
- auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
- if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
- return GetFunnelShift(Op0, Op1, ShAmt0);
- } else if (ShAmt1.getOpcode() == ISD::XOR) {
- SDValue Mask = ShAmt1.getOperand(1);
- if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
- unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
- SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
- if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
- ShAmt1Op0 = ShAmt1Op0.getOperand(0);
- if (MaskC->getSExtValue() == (Bits - 1) &&
- (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
- if (Op1.getOpcode() == InnerShift &&
- isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandAPInt(1).isOneValue()) {
- return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
- }
- // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
- if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
- Op1.getOperand(0) == Op1.getOperand(1)) {
- return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
- }
- }
- }
- }
-
- return SDValue();
-}
-
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -40771,7 +43239,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40781,13 +43250,19 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
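+ // e.g. an any-of test over the first K lanes becomes:
+ //   (Mask & ((1 << K) - 1)) != 0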
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
}
}
}
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -40803,8 +43278,33 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
- if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
- return R;
+ // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
+ // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
+ // iff the upper elements of the non-shifted arg are zero.
+ // KUNPCK requires 16+ bool vector elements.
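+ // e.g. (illustrative) v32i1 OR(X, KSHIFTL(Y, 16)), where the upper 16 lanes
+ // of X are known zero, becomes CONCAT_VECTORS(X[15:0], Y[15:0]) == KUNPCKWD.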
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
+ if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
+ N1.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N0, 0, DAG, dl, HalfElts),
+ extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
+ N0.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N1, 0, DAG, dl, HalfElts),
+ extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ }
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -41153,18 +43653,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// A lambda checking the given SDValue is a constant vector and each element
// is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
- BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV || !BV->isConstant())
- return false;
- for (SDValue Op : V->ops()) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C)
- return false;
- const APInt &Val = C->getAPIntValue();
- if (Val.ult(Min) || Val.ugt(Max))
- return false;
- }
- return true;
+ return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
+ return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
+ });
};
// Check if each element of the vector is right-shifted by one.
@@ -41265,10 +43756,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
- unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
- ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
+ Ld->getAlignment() >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
@@ -41276,17 +43767,18 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
if (NumElems < 2)
return SDValue();
- unsigned HalfAlign = 16;
+ unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
- SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
+ SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
- Alignment, Ld->getMemOperand()->getFlags());
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
- Ld->getPointerInfo().getWithOffset(HalfAlign),
- MinAlign(Alignment, HalfAlign),
+ Ld->getPointerInfo().getWithOffset(HalfOffset),
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
@@ -41303,13 +43795,28 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Alignment,
+ Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
+ // Cast ptr32 and ptr64 pointers to the default address space before a load.
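+ // (These address spaces model mixed-size pointers, e.g. MSVC's
+ // __ptr32/__ptr64 qualifiers.)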
+ unsigned AddrSpace = Ld->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
+ return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
+ }
+
return SDValue();
}
@@ -41456,7 +43963,7 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ auto *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
@@ -41465,12 +43972,33 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
+
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
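+ // (The AVX/AVX2 masked load instructions test only the sign bit of each
+ // mask element.)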
+ SDValue Mask = Mld->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ EVT VT = Mld->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedLoad(
+ VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
+ NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
+ Mld->getAddressingMode(), Mld->getExtensionType());
+ }
+
return SDValue();
}
@@ -41522,9 +44050,18 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
+ Mst->getBasePtr(), Mst->getOffset(), NewMask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode());
}
SDValue Value = Mst->getValue();
@@ -41546,7 +44083,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
- unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41559,7 +44095,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41570,7 +44106,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
@@ -41581,7 +44118,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41590,7 +44127,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
// If its a v64i1 store without 64-bit support, we need two stores.
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
@@ -41603,18 +44140,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
- Alignment, St->getMemOperand()->getFlags());
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
St->getPointerInfo().getWithOffset(4),
- MinAlign(Alignment, 4U),
+ St->getOriginalAlign(),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41633,7 +44171,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
// Split under-aligned vector non-temporal stores.
- if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ if (St->isNonTemporal() && StVT == VT &&
+ St->getAlignment() < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -41687,7 +44226,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
if (TLI.isTruncStoreLegal(VT, StVT)) {
@@ -41705,6 +44244,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+ // Cast ptr32 and ptr64 pointers to the default address space before a store.
+ unsigned AddrSpace = St->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != St->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
+ return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags(), St->getAAInfo());
+ }
+ }
+
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
@@ -41759,13 +44312,38 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
+static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ auto *St = cast<MemIntrinsicSDNode>(N);
+
+ SDValue StoredVal = N->getOperand(1);
+ MVT VT = StoredVal.getSimpleValueType();
+ EVT MemVT = St->getMemoryVT();
+
+ // Figure out which elements we demand.
+ unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
+ APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
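+ // e.g. a 64-bit VEXTRACT_STORE of a v4i32 value demands only the low 2
+ // elements.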
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
+ KnownZero, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
@@ -42002,17 +44580,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
switch (SrcOpcode) {
- case ISD::AND:
- case ISD::XOR:
- case ISD::OR: {
- SDValue Op0 = Src.getOperand(0);
- SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
- (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
- return TruncateArithmetic(Op0, Op1);
- break;
- }
-
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
@@ -42021,21 +44588,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
- case ISD::ADD: {
- SDValue Op0 = Src.getOperand(0);
- SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(SrcOpcode, VT) &&
- (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
- return TruncateArithmetic(Op0, Op1);
- break;
- }
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::ADD:
case ISD::SUB: {
- // TODO: ISD::SUB We are conservative and require both sides to be freely
- // truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
- (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
+ (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -42146,13 +44707,17 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
- if (!VT.is128BitVector() && !VT.is256BitVector())
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // Truncation to sub-128-bit vXi32 can be better handled with shuffles.
+ if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
+ return SDValue();
+
// AVX512 has fast truncate, but if the input is already going to be split,
// there's no harm in trying pack.
if (Subtarget.hasAVX512() &&
@@ -42173,6 +44738,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+
+ // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
+ // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
+ // on and combines/simplifications can't then use it.
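+ // (For the vXi64 -> vXi32 case this means every source element must be
+ // known to be 0 or -1.)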
+ if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
+ return SDValue();
+
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
@@ -42201,9 +44773,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
- // Input type should be vXi32.
+ // Input type should be at least vXi32.
EVT InVT = Src.getValueType();
- if (InVT.getVectorElementType() != MVT::i32)
+ if (InVT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
@@ -42412,7 +44984,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return combineVectorTruncation(N, DAG, Subtarget);
}
-static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
SDLoc DL(N);
@@ -42422,6 +44995,11 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
return SDValue();
}
@@ -42514,37 +45092,46 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMADD: Opcode = ISD::FMA; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FMSUB: Opcode = ISD::FMA; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
- case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
- case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
- case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
}
}
if (NegRes) {
switch (Opcode) {
+ // For accuracy reasons, we never combine fneg and fma under strict FP.
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
@@ -42562,18 +45149,20 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT))
return SDValue();
// If we're negating a FMUL node on a target with FMA, then we can avoid the
@@ -42587,80 +45176,25 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(OrigVT, NewNode);
}
- // If we're negating an FMA node, then we can adjust the
- // instruction to include the extra negation.
- if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
- switch (Arg.getOpcode()) {
- case ISD::FMA:
- case X86ISD::FMSUB:
- case X86ISD::FNMADD:
- case X86ISD::FNMSUB:
- case X86ISD::FMADD_RND:
- case X86ISD::FMSUB_RND:
- case X86ISD::FNMADD_RND:
- case X86ISD::FNMSUB_RND: {
- // We can't handle scalar intrinsic node here because it would only
- // invert one element and not the whole vector. But we could try to handle
- // a negation of the lower element only.
- unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
- return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
- }
- }
- }
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+ if (SDValue NegArg =
+ TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
+ return DAG.getBitcast(OrigVT, NegArg);
return SDValue();
}
-char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations,
- bool ForCodeSize,
- unsigned Depth) const {
- // fneg patterns are removable even if they have multiple uses.
- if (isFNEG(DAG, Op.getNode(), Depth))
- return 2;
-
- // Don't recurse exponentially.
- if (Depth > SelectionDAG::MaxRecursionDepth)
- return 0;
-
- EVT VT = Op.getValueType();
- EVT SVT = VT.getScalarType();
- switch (Op.getOpcode()) {
- case ISD::FMA:
- case X86ISD::FMSUB:
- case X86ISD::FNMADD:
- case X86ISD::FNMSUB:
- case X86ISD::FMADD_RND:
- case X86ISD::FMSUB_RND:
- case X86ISD::FNMADD_RND:
- case X86ISD::FNMSUB_RND: {
- if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
- !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
- break;
-
- // This is always negatible for free but we might be able to remove some
- // extra operand negations as well.
- for (int i = 0; i != 3; ++i) {
- char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (V == 2)
- return V;
- }
- return 1;
- }
- }
-
- return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
- ForCodeSize, Depth);
-}
-
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
+ NegatibleCost &Cost,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
- if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
+ if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
+ Cost = NegatibleCost::Cheaper;
return DAG.getBitcast(Op.getValueType(), Arg);
+ }
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
@@ -42675,35 +45209,41 @@ SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
- !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ !(SVT == MVT::f32 || SVT == MVT::f64) ||
+ !isOperationLegal(ISD::FMA, VT))
break;
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
- for (int i = 0; i != 3; ++i) {
- char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (V == 2)
- NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- }
+ for (int i = 0; i != 3; ++i)
+ NewOps[i] = getCheaperNegatedExpression(
+ Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
bool NegA = !!NewOps[0];
bool NegB = !!NewOps[1];
bool NegC = !!NewOps[2];
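+ // Negating exactly one of the multiplicands flips the sign of the product,
+ // hence NegMul = (NegA != NegB).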
unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
+ Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
+ : NegatibleCost::Neutral;
+
// Fill in the non-negated ops with the original values.
for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
if (!NewOps[i])
NewOps[i] = Op.getOperand(i);
return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
}
+ case X86ISD::FRCP:
+ if (SDValue NegOp0 =
+ getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Cost, Depth + 1))
+ return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
+ break;
}
return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
- ForCodeSize, Depth);
+ ForCodeSize, Cost, Depth);
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
@@ -42764,6 +45304,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -42776,33 +45319,21 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
- return combineFneg(N, DAG, Subtarget);
+ return combineFneg(N, DAG, DCI, Subtarget);
}
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
// TODO - Constant Folding.
- if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
- // Reduce Cst1 to the bottom 16-bits.
- // NOTE: SimplifyDemandedBits won't do this for constants.
- const APInt &Val1 = Cst1->getAPIntValue();
- APInt MaskedVal1 = Val1 & 0xFFFF;
- if (MaskedVal1 != Val1)
- return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
- DAG.getConstant(MaskedVal1, SDLoc(N), VT));
- }
-
- // Only bottom 16-bits of the control bits are required.
- APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
- if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
+
+ // Simplify the inputs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
@@ -42893,6 +45424,7 @@ static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
@@ -42904,7 +45436,7 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
- if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
+ if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
@@ -43015,23 +45547,16 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile or atomic.
- if (LN->isSimple()) {
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
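+ // e.g. a v2f64 int-to-fp conversion of a v4i32 load only needs the low
+ // 64 bits, so an i64 vzload suffices.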
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
- MVT MemVT = MVT::getIntegerVT(NumBits);
- MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
- SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
- LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
@@ -43041,33 +45566,33 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- // FIXME: Handle strict fp nodes.
+ bool IsStrict = N->isTargetStrictFPOpcode();
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
- SDValue In = N->getOperand(0);
+ SDValue In = N->getOperand(IsStrict ? 1 : 0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(In);
- // Unless the load is volatile or atomic.
- if (LN->isSimple()) {
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
- MVT MemVT = MVT::getFloatingPointVT(NumBits);
- MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
- SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
- LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
- SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
- DAG.getBitcast(InVT, VZLoad));
- DCI.CombineTo(N, Convert);
+ if (IsStrict) {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
@@ -43106,14 +45631,58 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
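+ // e.g. for a 32-bit bit-test only the low 5 bits of the index are demanded.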
- if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
- return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
+ if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+
+ if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getLowBitsSet(8, 4);
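+ // (The v8i16 -> v4f32 conversion reads only the low 4 source elements.)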
+ if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+ DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
+ SDLoc dl(N);
+ if (IsStrict) {
+ SDValue Convert = DAG.getNode(
+ N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
+ DAG.getBitcast(MVT::v8i16, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return SDValue(N, 0);
+ }
+ }
+ }
return SDValue();
}
@@ -43199,7 +45768,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
//(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
- N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2,
@@ -43208,9 +45777,14 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
+ // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
+ // gets in the way.
+ if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
+
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
- SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
- N00, N1);
+ SDValue Tmp =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
@@ -43395,6 +45969,21 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+ (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+ // If we have register broadcast instructions, use the scalar size as the
+ // element type for the shuffle. Then cast to the wider element type. The
+ // widened bits won't be used, and this might allow the use of a broadcast
+ // load.
+ assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+ unsigned Scale = EltSizeInBits / NumElts;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ ShuffleMask.append(NumElts * Scale, 0);
+ Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+ Vec = DAG.getBitcast(VT, Vec);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
@@ -43402,8 +45991,8 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
}
- Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
@@ -43448,7 +46037,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
// We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
- if (Size > 256)
+ if (Size > 256 && Subtarget.useAVX512Regs())
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
@@ -43466,7 +46055,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
- Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
+ Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
return Res;
}
@@ -43479,6 +46068,23 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
EVT InVT = N0.getValueType();
SDLoc DL(N);
+ // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ if (!DCI.isBeforeLegalizeOps() &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
+ }
+
+ return SDValue(N, 0);
+ }
+
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
@@ -43516,6 +46122,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -43526,15 +46133,16 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
- SDValue A = N->getOperand(0);
- SDValue B = N->getOperand(1);
- SDValue C = N->getOperand(2);
+ SDValue A = N->getOperand(IsStrict ? 1 : 0);
+ SDValue B = N->getOperand(IsStrict ? 2 : 1);
+ SDValue C = N->getOperand(IsStrict ? 3 : 2);
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
- if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
- V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
+ CodeSize)) {
+ V = NegV;
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
@@ -43542,11 +46150,10 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
SDValue Vec = V.getOperand(0);
- if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
- SDValue NegVal =
- TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(
+ Vec, DAG, LegalOperations, CodeSize)) {
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
- NegVal, V.getOperand(1));
+ NegV, V.getOperand(1));
return true;
}
}
@@ -43566,9 +46173,15 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
- if (N->getNumOperands() == 4)
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ if (IsStrict) {
+ assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
+ return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
+ {N->getOperand(0), A, B, C});
+ } else {
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ }
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -43582,10 +46195,11 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
bool LegalOperations = !DCI.isBeforeLegalizeOps();
SDValue N2 = N->getOperand(2);
- if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
- return SDValue();
- SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ SDValue NegN2 =
+ TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ if (!NegN2)
+ return SDValue();
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
@@ -43598,38 +46212,26 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
- // (and (i32 x86isd::setcc_carry), 1)
- // This eliminates the zext. This transformation is necessary because
- // ISD::SETCC is always legalized to i8.
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- if (N0.getOpcode() == ISD::AND &&
- N0.hasOneUse() &&
- N0.getOperand(0).hasOneUse()) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- if (!isOneConstant(N0.getOperand(1)))
- return SDValue();
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, dl, VT));
+ // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ // FIXME: Is this needed? We don't seem to have any tests for it.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
}
- }
- if (N0.getOpcode() == ISD::TRUNCATE &&
- N0.hasOneUse() &&
- N0.getOperand(0).hasOneUse()) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, dl, VT));
- }
+ return SDValue(N, 0);
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
@@ -43742,13 +46344,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
- bool HasAVX = Subtarget.hasAVX();
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && HasAVX) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
@@ -43802,11 +46403,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
X = DAG.getBitcast(TmpCastVT, X);
if (!NeedZExt && !TmpZext)
return X;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
DAG.getConstant(0, DL, VecVT), X,
- DAG.getConstant(0, DL, VecIdxVT));
+ DAG.getVectorIdxConstant(0, DL));
};
SDValue Cmp;
@@ -43839,17 +46438,16 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
Cmp);
SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
- SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
+ SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
- // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
- // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ assert(Cmp.getValueType() == MVT::v16i8 &&
+ "Non 128-bit vector on pre-SSE41 target");
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
- SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
- MVT::i32);
+ SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
@@ -43866,23 +46464,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
- // 0-x == y --> x+y == 0
- // 0-x != y --> x+y != 0
- if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
- LHS.hasOneUse()) {
- SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
- return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
- }
- // x == 0-y --> x+y == 0
- // x != 0-y --> x+y != 0
- if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
- RHS.hasOneUse()) {
- SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
- return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
- }
-
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
+
+ if (VT == MVT::i1 && isNullConstant(RHS)) {
+ SDValue X86CC;
+ if (SDValue V =
+ MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
+ }
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
@@ -43905,7 +46496,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (IsSEXT0 && IsVZero1) {
assert(VT == Op0.getOperand(0).getValueType() &&
- "Uexpected operand type");
+ "Unexpected operand type");
if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (TmpCC == ISD::SETLE)
@@ -43995,20 +46586,43 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
}
return SDValue();
}
+static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
+ SDValue Index, SDValue Base, SDValue Scale,
+ SelectionDAG &DAG) {
+ SDLoc DL(GorS);
+
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
+ Gather->getMask(), Base, Index, Scale } ;
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
+ Scatter->getMask(), Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+}
+
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto *GorS = cast<MaskedGatherScatterSDNode>(N);
- SDValue Chain = GorS->getChain();
SDValue Index = GorS->getIndex();
- SDValue Mask = GorS->getMask();
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
@@ -44028,21 +46642,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
@@ -44057,21 +46657,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
@@ -44084,30 +46670,20 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
// With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = GorS->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
}
return SDValue();
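The index-narrowing performed above pays off because dword-index gathers cover twice as many elements per instruction as qword-index ones. A rough AVX2 intrinsics illustration (an assumption-laden sketch, not code from this file):

#include <immintrin.h>

// With 32-bit indices a single vpgatherdd loads eight i32 elements; with
// 64-bit indices the equivalent gather only covers four, so shrinking the
// index type when the upper bits are known sign/zero bits is profitable.
__m256i gather8(const int *base, __m256i idx32) {
  return _mm256_i32gather_epi32(base, idx32, /*scale=*/4);
}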
@@ -44146,10 +46722,11 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO: Could we move this to DAGCombine?
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
- // Take advantage of vector comparisons producing 0 or -1 in each lane to
- // optimize away operation when it's from a constant.
+ // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
+ // to optimize away the operation when its input comes from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
@@ -44161,9 +46738,10 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// aren't the same.
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
- if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
- Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+ if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
+ DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
@@ -44336,7 +46914,6 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
- EVT LdVT = Ld->getValueType(0);
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
@@ -44347,11 +46924,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
- if (Ld->isSimple() && !VT.isVector() &&
- ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !Subtarget.is64Bit() && LdVT == MVT::i64) {
- std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
- SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
+ if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
+ Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
+ std::pair<SDValue, SDValue> Tmp =
+ Subtarget.getTargetLowering()->BuildFILD(
+ VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
return Tmp.first;
}
@@ -44685,7 +47263,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
}
if (CC == X86::COND_A) {
- SDValue EFLAGS = Y->getOperand(1);
+ SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
@@ -44698,13 +47276,44 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
+ if (CC == X86::COND_AE) {
+ // X + SETAE --> sbb X, -1
+ // X - SETAE --> adc X, -1
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), Y.getOperand(1));
+ }
+
+ if (CC == X86::COND_BE) {
+ // X + SETBE --> sbb X, -1
+ // X - SETBE --> adc X, -1
+ SDValue EFLAGS = Y.getOperand(1);
+ // Try to convert COND_BE into COND_AE in an attempt to facilitate
+ // materializing "setae reg".
+ //
+ // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), NewEFLAGS);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
@@ -44741,15 +47350,18 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
- SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Cmp1.getValue(1));
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
- SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
@@ -44758,151 +47370,12 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), Cmp1);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), Cmp1);
-}
-
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // If the vector size is less than 128, or greater than the supported RegSize,
- // do not use PMADD.
- if (!VT.isVector() || VT.getVectorNumElements() < 8)
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- auto UsePMADDWD = [&](SDValue Op) {
- ShrinkMode Mode;
- return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
- Mode != ShrinkMode::MULU16 &&
- (!Subtarget.hasSSE41() ||
- (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- Op->isOnlyUserOf(Op.getOperand(1).getNode())));
- };
-
- SDValue MulOp, OtherOp;
- if (UsePMADDWD(Op0)) {
- MulOp = Op0;
- OtherOp = Op1;
- } else if (UsePMADDWD(Op1)) {
- MulOp = Op1;
- OtherOp = Op0;
- } else
- return SDValue();
-
- SDLoc DL(N);
- EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements());
- EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- VT.getVectorNumElements() / 2);
-
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
- // Madd vector size is half of the original vector size
- auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
- };
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
- if (!VT.isVector() || !VT.isSimple() ||
- !(VT.getVectorElementType() == MVT::i32))
- return SDValue();
-
- unsigned RegSize = 128;
- if (Subtarget.useBWIRegs())
- RegSize = 512;
- else if (Subtarget.hasAVX())
- RegSize = 256;
-
- // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
- // TODO: We should be able to handle larger vectors by splitting them before
- // feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
- return SDValue();
-
- // We know N is a reduction add. To match SAD, we need one of the operands to
- // be an ABS.
- SDValue AbsOp = N->getOperand(0);
- SDValue OtherOp = N->getOperand(1);
- if (AbsOp.getOpcode() != ISD::ABS)
- std::swap(AbsOp, OtherOp);
- if (AbsOp.getOpcode() != ISD::ABS)
- return SDValue();
-
- // Check whether we have an abs-diff pattern feeding into the select.
- SDValue SadOp0, SadOp1;
- if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
- return SDValue();
-
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
- // the PSADBW will be zero. If we promote/ narrow vectors, truncate the v2i64
- // result to v2i32 which will be removed by type legalization. If we/ widen
- // narrow vectors then we bitcast to v4i32 and extract v2i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
- unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
- Ops[0] = Sad;
- Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
- Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
+ DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
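The COND_AE/COND_BE cases added above extend the existing carry-flag folds in this function. A rough source-level illustration (a sketch; the exact instruction sequence depends on the compiler):

#include <cstdint>

// "x + (z != 0)" can be lowered without a setcc+zext by materializing the
// carry flag: cmp/sub z, 1 sets CF exactly when z == 0, and then
// sbb x, -1 computes x - (-1) - CF == x + (z != 0).
uint64_t add_nonzero_flag(uint64_t x, uint64_t z) {
  return x + (z != 0);
}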
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
@@ -44994,30 +47467,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
Mode == ShrinkMode::MULU16)
return SDValue();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- // Shrink by adding truncate nodes and let DAGCombine fold with the
- // sources.
EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i32 &&
- "Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- InVT.getVectorNumElements());
- return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Mul.getOperand(0), Mul.getOperand(1) },
- PMADDBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}
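For context, the instruction matchPMADDWD builds has the following semantics, shown here with an SSE2 intrinsic (illustrative only):

#include <immintrin.h>

// vpmaddwd multiplies corresponding pairs of signed 16-bit elements and adds
// each adjacent pair of 32-bit products, halving the element count:
//   result[i] = a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1]
__m128i dot_pairs(__m128i a16, __m128i b16) {
  return _mm_madd_epi16(a16, b16);
}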
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-// (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+// (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
@@ -45139,13 +47607,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- const SDNodeFlags Flags = N->getFlags();
- if (Flags.hasVectorReduction()) {
- if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
- return Sad;
- if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
- return MAdd;
- }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -45236,6 +47697,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SubusRHS = MinLHS;
else
return SDValue();
+ } else if (Op1.getOpcode() == ISD::TRUNCATE &&
+ Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+ // Special case where the UMIN has been truncated. Try to push the truncate
+ // further up. This is similar to the i32/i64 special processing.
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+ SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+ EVT TruncVT = Op1.getOperand(0).getValueType();
+ if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
+ TruncVT == MVT::v8i64)) &&
+ !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
+ return SDValue();
+ SDValue OpToSaturate;
+ if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinLHS.getOperand(0) == Op0)
+ OpToSaturate = MinRHS;
+ else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinRHS.getOperand(0) == Op0)
+ OpToSaturate = MinLHS;
+ else
+ return SDValue();
+
+ // Saturate the non-extended input and then truncate it.
+ SDLoc DL(N);
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits()),
+ DL, TruncVT);
+ SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
+ SaturationConst);
+ SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
} else
return SDValue();
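The detection above relies on the usual unsigned saturating-subtract identity, sketched here with SSE2/SSE4.1 intrinsics (illustrative, not part of the patch):

#include <immintrin.h>

// For unsigned lanes, x - min(x, y) == usubsat(x, y): when y >= x the result
// is 0, otherwise it is x - y. The combine rewrites the min-based form, which
// is what the DAG sees, into a single psubus.
__m128i usubsat_u16(__m128i x, __m128i y) {
  return _mm_subs_epu16(x, y);                    // psubusw
}
__m128i usubsat_u16_via_min(__m128i x, __m128i y) {
  return _mm_sub_epi16(x, _mm_min_epu16(x, y));   // sub(x, umin(x, y)) pattern
}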
@@ -45350,6 +47843,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
@@ -45360,6 +47854,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
@@ -45376,13 +47871,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
// Repeated subvectors.
- if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+ if (IsSplat) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+ // If this broadcast_load is inserted into both halves, use a larger
+ // broadcast_load. Update other uses to use an extracted subvector.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
@@ -45394,12 +47904,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
- (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
- }
- bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+ // concat_vectors(extract_subvector(broadcast(x)),
+ // extract_subvector(broadcast(x))) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op0.getOperand(0).getValueType() == VT) {
+ if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
+ return Op0.getOperand(0);
+ }
+ }
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
@@ -45409,6 +47926,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
+ case X86ISD::SHUFP: {
+ // Add SHUFPD support if/when necessary.
+ if (!IsSplat && VT.getScalarType() == MVT::f32 &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOperand(2) == Op0.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
+ }
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
@@ -45435,8 +47970,42 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return DAG.getBitcast(VT, Res);
}
break;
+ case X86ISD::VSHLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::VPERMI:
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI:
+ if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::PACKSS:
case X86ISD::PACKUS:
- if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
@@ -45450,6 +48019,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
+ case X86ISD::PALIGNR:
+ if (!IsSplat &&
+ ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(2) == Op.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
}
}
@@ -45539,7 +48126,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
- (IdxVal != 0 || !Vec.isUndef())) {
+ (IdxVal != 0 ||
+ !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
@@ -45628,7 +48216,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
- unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ unsigned ExtIdx = Ext->getConstantOperandVal(1);
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
@@ -45673,6 +48261,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
+ unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -45690,7 +48279,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
- SDValue Concat = split256IntArith(InVecBC, DAG);
+ SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
@@ -45702,8 +48291,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
- unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
@@ -45753,6 +48340,43 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+ // If we're extracting an upper subvector from a broadcast we should just
+ // extract the lowest subvector instead, which should allow
+ // SimplifyDemandedVectorElts to do more simplifications.
+ if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
+
+ // If we're extracting a broadcasted subvector, just use the source.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
+ InVec.getOperand(0).getValueType() == VT)
+ return InVec.getOperand(0);
+
+ // Attempt to extract from the source of a shuffle vector.
+ if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
+ (IdxVal % VT.getVectorNumElements()) == 0) {
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<int, 32> ScaledMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
+ // Decode the shuffle mask and scale it so it's shuffling whole subvectors.
+ if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
+ scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
+ unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
+ if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+ if (ScaledMask[SubVecIdx] == SM_SentinelZero)
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
+ SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
+ if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
+ unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
+ unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
+ return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
+ SDLoc(N), VT.getSizeInBits());
+ }
+ }
+ }
+
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
@@ -45825,13 +48449,30 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.getOperand(1));
// Reduce v2i64 to v4i32 if we don't need the upper bits.
- // TODO: Move to DAGCombine?
- if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
- Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
- Src.getOperand(0).getScalarValueSizeInBits() <= 32)
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
- DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
+ // TODO: Move to DAGCombine/SimplifyDemandedBits?
+ if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ auto IsAnyExt64 = [](SDValue Op) {
+ if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
+ return SDValue();
+ if (Op.getOpcode() == ISD::ANY_EXTEND &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return Op.getOperand(0);
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op))
+ if (Ld->getExtensionType() == ISD::EXTLOAD &&
+ Ld->getMemoryVT().getScalarSizeInBits() <= 32)
+ return Op;
+ return SDValue();
+ };
+ if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
+ }
+
+ // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
return SDValue();
}
@@ -45902,13 +48543,16 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
- ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
- EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
- VT.getVectorNumElements());
+ ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
+ ? ISD::SEXTLOAD
+ : ISD::ZEXTLOAD;
+ EVT MemVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getPointerInfo(), MemVT,
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
@@ -45945,6 +48589,196 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
+// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce
+// extra instructions between the conversion due to going to scalar and back.
+static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::f32 ||
+ N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
+ N->getOperand(0).getOperand(0));
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+}
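A scalar equivalent of the round-trip this combine produces, assuming an F16C-capable target (a sketch using the corresponding intrinsics):

#include <immintrin.h>

// Convert f32 -> f16 -> f32 with vcvtps2ph/vcvtph2ps; immediate 4
// (_MM_FROUND_CUR_DIRECTION) matches the constant used in the combine above.
float round_trip_f16(float x) {
  __m128 v = _mm_set_ss(x);
  __m128i h = _mm_cvtps_ph(v, _MM_FROUND_CUR_DIRECTION);
  return _mm_cvtss_f32(_mm_cvtph_ps(h));
}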
+
+static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Convert the input to vXi16.
+ EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+ Src = DAG.getBitcast(IntVT, Src);
+
+ // Widen to at least 8 input elements.
+ if (NumElts < 8) {
+ unsigned NumConcats = 8 / NumElts;
+ SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
+ : DAG.getConstant(0, dl, IntVT);
+ SmallVector<SDValue, 4> Ops(NumConcats, Fill);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
+ }
+
+ // Destination is vXf32 with at least 4 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
+ std::max(4U, NumElts));
+ SDValue Cvt, Chain;
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
+ }
+
+ if (NumElts < 4) {
+ assert(NumElts == 2 && "Unexpected size");
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (IsStrict) {
+ // Extend to the original VT if necessary.
+ if (Cvt.getValueType() != VT) {
+ Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
+ {Chain, Cvt});
+ Chain = Cvt.getValue(1);
+ }
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+ }
+
+ // Extend to the original VT if necessary.
+ return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
+}
+
+// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
+// cases where the loads have the same input chain and the output chains are
+// unused. This avoids any memory ordering issues.
+static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Only do this if the chain result is unused.
+ if (N->hasAnyUseOfValue(1))
+ return SDValue();
+
+ auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
+
+ SDValue Ptr = MemIntrin->getBasePtr();
+ SDValue Chain = MemIntrin->getChain();
+ EVT VT = N->getSimpleValueType(0);
+ EVT MemVT = MemIntrin->getMemoryVT();
+
+ // Look at other users of our base pointer and try to find a wider broadcast.
+ // The input chain and the size of the memory VT must match.
+ for (SDNode *User : Ptr->uses())
+ if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ VT.getSizeInBits());
+ Extract = DAG.getBitcast(VT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+
+ return SDValue();
+}
+
+static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
+ SrcVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Widen to at least 4 input elements.
+ if (NumElts < 4)
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, SrcVT));
+
+ // Destination is v8i16 with at least 8 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ std::max(8U, NumElts));
+ SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+
+ // Extract down to real number of elements.
+ if (NumElts < 8) {
+ EVT IntVT = VT.changeVectorElementTypeToInteger();
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ return DAG.getBitcast(VT, Cvt);
+}
+
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+ SDValue Src = N->getOperand(0);
+
+ // Turn MOVDQ2Q+simple_load into an mmx load.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
+ if (LN->isSimple()) {
+ SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+ LN->getBasePtr(),
+ LN->getPointerInfo(),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+ return NewLd;
+ }
+ }
+
+ return SDValue();
+}
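The MOVDQ2Q node this combine rewrites corresponds to the SSE2 intrinsic below (an illustrative sketch); when its source is a plain 64-bit load, the combine above loads straight into an MMX register instead:

#include <immintrin.h>

// movdq2q: move the low 64 bits of an XMM register into an MMX register.
__m64 low_qword(__m128i v) {
  return _mm_movepi64_pi64(v);
}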
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -45976,8 +48810,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
- case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
- case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
@@ -45986,6 +48820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
+ case X86ISD::VEXTRACT_STORE:
+ return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
return combineSIntToFP(N, DAG, DCI, Subtarget);
@@ -45994,14 +48830,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
- case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
- case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
+ case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
- case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
+ case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
@@ -46010,8 +48846,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
+ case X86ISD::STRICT_CVTTP2SI:
case X86ISD::CVTTP2SI:
- case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::CVTTP2UI:
+ return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTPH2PS:
+ case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
@@ -46034,12 +48875,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_VECTOR_ELT:
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -46071,12 +48914,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
+ case X86ISD::STRICT_FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
+ case X86ISD::STRICT_FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
+ case X86ISD::STRICT_FNMSUB:
case X86ISD::FNMSUB_RND:
- case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
+ case ISD::FMA:
+ case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
@@ -46092,6 +48939,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
+ case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
+ case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
}
return SDValue();
@@ -46240,27 +49093,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
return true;
}
-bool X86TargetLowering::
- isDesirableToCombineBuildVectorToShuffleTruncate(
- ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
-
- assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
- "Element count mismatch");
- assert(
- Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
- "Shuffle Mask expected to be legal");
-
- // For 32-bit elements VPERMD is better than shuffle+truncate.
- // TODO: After we improve lowerBuildVector, add execption for VPERMW.
- if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
- return false;
-
- if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
- return false;
-
- return true;
-}
-
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -46301,7 +49133,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
const std::string &AsmStr = IA->getAsmString();
@@ -46424,7 +49256,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'y':
case 'x':
case 'v':
- case 'Y':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
@@ -46461,7 +49292,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
case 'z':
- case '0':
return C_Register;
case 'i':
case 'm':
@@ -46517,19 +49347,17 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
- case 'Y': {
- unsigned Size = StringRef(constraint).size();
- // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
- char NextChar = Size == 2 ? constraint[1] : 'i';
- if (Size > 2)
+ case 'Y':
+ if (StringRef(constraint).size() != 2)
break;
- switch (NextChar) {
+ switch (constraint[1]) {
default:
return CW_Invalid;
// XMM0
case 'z':
- case '0':
- if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+ ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
@@ -46542,7 +49370,7 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
- // Any SSE reg when ISA >= SSE2, same as 'Y'
+ // Any SSE reg when ISA >= SSE2, same as 'x'
case 'i':
case 't':
case '2':
@@ -46550,9 +49378,7 @@ TargetLowering::ConstraintWeight
return CW_Invalid;
break;
}
- // Fall through (handle "Y" constraint).
- LLVM_FALLTHROUGH;
- }
+ break;
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
@@ -46634,8 +49460,6 @@ LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
- if (Subtarget.hasSSE2())
- return "Y";
if (Subtarget.hasSSE1())
return "x";
}
@@ -46884,26 +49708,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, &X86::GR32RegClass);
- if (VT == MVT::i16)
- return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
- if (VT == MVT::i64 || VT == MVT::f64)
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, &X86::GR32_ABCDRegClass);
- if (VT == MVT::i16)
- return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
- if (VT == MVT::i64)
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
@@ -46914,15 +49738,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
- return std::make_pair(0U, &X86::GR64RegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
- if (VT == MVT::i32 || !Subtarget.is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
- return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ break;
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
@@ -46930,13 +49758,12 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
- return std::make_pair(0U, &X86::RFP80RegClass);
+ if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
+ return std::make_pair(0U, &X86::RFP80RegClass);
+ break;
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
- case 'Y': // SSE_REGS if SSE2 allowed
- if (!Subtarget.hasSSE2()) break;
- LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
@@ -46955,7 +49782,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
- // TODO: Handle i128 in FR128RegClass after it is tested well.
+ case MVT::i128:
+ if (Subtarget.is64Bit()) {
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR128XRegClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
+ }
+ break;
// Vector types and fp128.
case MVT::f128:
case MVT::v16i8:
@@ -46979,6 +49812,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
+ case MVT::v64i8:
+ case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
@@ -46997,14 +49832,50 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'i':
case 't':
case '2':
- return getRegForInlineAsmConstraint(TRI, "Y", VT);
+ return getRegForInlineAsmConstraint(TRI, "x", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
- case '0':
if (!Subtarget.hasSSE1()) break;
- return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+ case MVT::f128:
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ if (Subtarget.hasAVX())
+ return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+ break;
+ }
+ break;
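The 'Yz' handling above corresponds to the GCC-style machine constraint that pins an operand to the first SSE register (%xmm0, or %ymm0/%zmm0 for wider types). A contrived usage sketch, assuming the compiler accepts the constraint:

float force_xmm0(float x) {
  float y;
  // "Yz" asks for %xmm0 specifically; "=x" lets the result live in any SSE reg.
  __asm__("movaps %1, %0" : "=x"(y) : "Yz"(x));
  return y;
}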
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) {
@@ -47030,7 +49901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
- std::pair<unsigned, const TargetRegisterClass*> Res;
+ std::pair<Register, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
@@ -47101,7 +49972,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
- unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+ Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
@@ -47217,8 +50088,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -47275,10 +50145,35 @@ bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+/// Returns true if stack probing through a function call is requested.
+bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
+ return !getStackProbeSymbolName(MF).empty();
+}
+
+/// Returns true if stack probing through inline assembly is requested.
+bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+
+ // No inline stack probe for Windows, they have their own mechanism.
+ if (Subtarget.isOSWindows() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+ return false;
+
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+
+ return false;
+}
+
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+ // Inline stack probes disable the stack probe call.
+ if (hasInlineStackProbe(MF))
+ return "";
+
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();