author    Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/ARM/ARMISelLowering.cpp
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
Vendor import of llvm trunk r338150 (tag: vendor/llvm/llvm-trunk-r338150)
Notes:
    svn path=/vendor/llvm/dist/; revision=336809
    svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 871
1 file changed, 700 insertions(+), 171 deletions(-)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index aeda7c06a27a..47222a66f798 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -53,7 +53,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -97,6 +96,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -308,13 +308,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setCmpLibcallCC(LC.Op, LC.Cond);
}
}
-
- // Set the correct calling convention for ARMv7k WatchOS. It's just
- // AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchABI()) {
- for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
- setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
- }
}
// These libcalls are not available in 32-bit.
@@ -522,6 +515,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
}
+ if (Subtarget->hasFullFP16()) {
+ addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ }
+
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -558,6 +561,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ if (Subtarget->hasFullFP16()) {
+ addQRTypeForNEON(MVT::v8f16);
+ addDRTypeForNEON(MVT::v4f16);
+ }
+
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
// neither Neon nor VFP support any arithmetic operations on it.
// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
@@ -820,10 +828,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
+ if (Subtarget->isThumb1Only()) {
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+ }
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -949,7 +959,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
@@ -1036,13 +1046,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::SETCC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::f16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
+ }
- // Thumb-1 cannot currently select ARMISD::SUBE.
- if (!Subtarget->isThumb1Only())
- setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+ setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
@@ -1121,6 +1136,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
@@ -1259,6 +1276,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
+ case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
+ case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
+ case ARMISD::VMOVSR: return "ARMISD::VMOVSR";
case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
@@ -1337,6 +1357,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
+ case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
+ case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -2465,12 +2487,37 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
+ bool ReturnF16 = false;
+
+ if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
+ // Half-precision return values can be returned like this:
+ //
+ // t11: f16 = fadd ...
+ // t12: i16 = bitcast t11
+ // t13: i32 = zero_extend t12
+ // t14: f32 = bitcast t13 <~~~~~~~ Arg
+ //
+ // to avoid code generation for bitcasts, we simply set Arg to the node
+ // that produces the f16 value, t11 in this case.
+ //
+ if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
+ SDValue ZE = Arg.getOperand(0);
+ if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
+ SDValue BC = ZE.getOperand(0);
+ if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
+ Arg = BC.getOperand(0);
+ ReturnF16 = true;
+ }
+ }
+ }
+ }
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ if (!ReturnF16)
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
@@ -2518,7 +2565,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(),
+ ReturnF16 ? MVT::f16 : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -2738,7 +2786,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
-/// \brief Convert a TLS address reference into the correct sequence of loads
+/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
@@ -2959,7 +3007,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
@@ -3675,11 +3723,14 @@ SDValue ARMTargetLowering::LowerFormalArguments(
} else {
const TargetRegisterClass *RC;
- if (RegVT == MVT::f32)
+
+ if (RegVT == MVT::f16)
+ RC = &ARM::HPRRegClass;
+ else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
- else if (RegVT == MVT::f64)
+ else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
RC = &ARM::DPRRegClass;
- else if (RegVT == MVT::v2f64)
+ else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
@@ -3799,8 +3850,8 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
- if (!isLegalICmpImmediate(C)) {
- // Constant does not fit, try adjusting it by one?
+ if (!isLegalICmpImmediate((int32_t)C)) {
+ // Constant does not fit, try adjusting it by one.
switch (CC) {
default: break;
case ISD::SETLT:
@@ -3940,6 +3991,29 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
break;
+ case ISD::UMULO:
+ // We generate a UMUL_LOHI and then check if the high word is 0.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getConstant(0, dl, MVT::i32));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
+ case ISD::SMULO:
+ // We generate a SMUL_LOHI and then check if all the bits of the high word
+ // are the same as the sign bit of the low word.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::SMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getNode(ISD::SRA, dl, Op.getValueType(),
+ Value.getValue(0),
+ DAG.getConstant(31, dl, MVT::i32)));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
} // switch (...)
return std::make_pair(Value, OverflowCmp);
@@ -3973,11 +4047,12 @@ static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
SDLoc DL(BoolCarry);
EVT CarryVT = BoolCarry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
// This converts the boolean value carry into the carry flag by doing
- // ARMISD::ADDC Carry, ~0
- return DAG.getNode(ARMISD::ADDC, DL, DAG.getVTList(CarryVT, MVT::i32),
- BoolCarry, DAG.getConstant(NegOne, DL, CarryVT));
+ // ARMISD::SUBC Carry, 1
+ SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
+ DAG.getVTList(CarryVT, MVT::i32),
+ BoolCarry, DAG.getConstant(1, DL, CarryVT));
+ return Carry.getValue(1);
}
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
@@ -4313,6 +4388,48 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
return false;
}
+// Check if a condition of the type x < k ? k : x can be converted into a
+// bit operation instead of conditional moves.
+// Currently this is allowed given:
+// - The conditions and values match up
+// - k is 0 or -1 (all ones)
+// This function will not check the last condition; that's up to the caller.
+// It returns true if the transformation can be made, and in such case
+// returns x in V, and k in SatK.
+static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
+ SDValue &SatK)
+{
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+
+ SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
+ ? &RHS
+ : nullptr;
+
+ // No constant operation in comparison, early out
+ if (!K)
+ return false;
+
+ SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
+ V = (KTmp == TrueVal) ? FalseVal : TrueVal;
+ SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
+
+ // If the constant on left and right side, or variable on left and right,
+ // does not match, early out
+ if (*K != KTmp || V != VTmp)
+ return false;
+
+ if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
+ SatK = *K;
+ return true;
+ }
+
+ return false;
+}
+
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4331,6 +4448,25 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
}
+ // Try to convert expressions of the form x < k ? k : x (and similar forms)
+ // into more efficient bit operations, which is possible when k is 0 or -1
+ // On ARM and Thumb-2 which have flexible operand 2 this will result in
+ // single instructions. On Thumb the shift and the bit operation will be two
+ // instructions.
+ // Only allow this transformation on full-width (32-bit) operations
+ SDValue LowerSatConstant;
+ if (VT == MVT::i32 &&
+ isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
+ SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
+ DAG.getConstant(31, dl, VT));
+ if (isNullConstant(LowerSatConstant)) {
+ SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
+ DAG.getAllOnesConstant(dl, VT));
+ return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
+ } else if (isAllOnesConstant(LowerSatConstant))
+ return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
+ }
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -4380,9 +4516,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
- // Try to generate VMAXNM/VMINNM on ARMv8.
- if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ // Normalize the fp compare. If RHS is zero we keep it there so we match
+ // CMPFPw0 instead of CMPFP.
+ if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
+ (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -4532,10 +4671,14 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = Cond.getOpcode();
- if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO ||
- Opc == ISD::SSUBO || Opc == ISD::USUBO)) {
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
+ if (Cond.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || OptimizeMul)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
return SDValue();
@@ -4579,11 +4722,15 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = LHS.getOpcode();
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ Opc == ISD::USUBO || OptimizeMul) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
@@ -4614,8 +4761,6 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Chain, Dest, ARMcc, CCR, Cmp);
}
- assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-
if (getTargetMachine().Options.UnsafeFPMath &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ ||
CC == ISD::SETNE || CC == ISD::SETUNE)) {
@@ -4979,7 +5124,8 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
@@ -4988,8 +5134,78 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
- assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
- "ExpandBITCAST called for non-i64 type");
+ const bool HasFullFP16 = Subtarget->hasFullFP16();
+
+ if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
+ // FullFP16: half values are passed in S-registers, and we don't
+ // need any of the bitcast and moves:
+ //
+ // t2: f32,ch = CopyFromReg t0, Register:f32 %0
+ // t5: i32 = bitcast t2
+ // t18: f16 = ARMISD::VMOVhr t5
+ if (Op.getOpcode() != ISD::CopyFromReg ||
+ Op.getValueType() != MVT::f32)
+ return SDValue();
+
+ auto Move = N->use_begin();
+ if (Move->getOpcode() != ARMISD::VMOVhr)
+ return SDValue();
+
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+ SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
+ DAG.ReplaceAllUsesWith(*Move, &Copy);
+ return Copy;
+ }
+
+ if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+ if (!HasFullFP16)
+ return SDValue();
+ // SoftFP: read half-precision arguments:
+ //
+ // t2: i32,ch = ...
+ // t7: i16 = truncate t2 <~~~~ Op
+ // t8: f16 = bitcast t7 <~~~~ N
+ //
+ if (Op.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
+ MVT::f16, Op.getOperand(0));
+
+ return SDValue();
+ }
+
+ // Half-precision return values
+ if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+ if (!HasFullFP16)
+ return SDValue();
+ //
+ // t11: f16 = fadd t8, t10
+ // t12: i16 = bitcast t11 <~~~ SDNode N
+ // t13: i32 = zero_extend t12
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
+ // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
+ //
+ // transform this into:
+ //
+ // t20: i32 = ARMISD::VMOVrh t11
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
+ //
+ auto ZeroExtend = N->use_begin();
+ if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
+ ZeroExtend->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto Copy = ZeroExtend->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg &&
+ Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
+ SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
+ DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
+ return Cvt;
+ }
+ return SDValue();
+ }
+
+ if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+ return SDValue();
// Turn i64->f64 into VMOVDRR.
if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -5566,16 +5782,22 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
return Result;
}
-static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
- assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
+
+ // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
+ // have to invert the carry first.
+ Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), Carry);
+ // This converts the boolean value carry into the carry flag.
+ Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
- assert(Carry.getOpcode() != ISD::CARRY_FALSE);
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
@@ -5731,23 +5953,34 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
- bool IsDouble = Op.getValueType() == MVT::f64;
+ EVT VT = Op.getValueType();
+ bool IsDouble = (VT == MVT::f64);
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
const APFloat &FPVal = CFP->getValueAPF();
// Prevent floating-point constants from using literal loads
// when execute-only is enabled.
if (ST->genExecuteOnly()) {
+ // If we can represent the constant as an immediate, don't lower it
+ if (isFPImmLegal(FPVal, VT))
+ return Op;
+ // Otherwise, construct as integer, and move to float register
APInt INTVal = FPVal.bitcastToAPInt();
SDLoc DL(CFP);
- if (IsDouble) {
- SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
- SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
- if (!ST->isLittle())
- std::swap(Lo, Hi);
- return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
- } else {
- return DAG.getConstant(INTVal, DL, MVT::i32);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unknown floating point type!");
+ break;
+ case MVT::f64: {
+ SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
+ SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
+ if (!ST->isLittle())
+ std::swap(Lo, Hi);
+ return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
+ }
+ case MVT::f32:
+ return DAG.getNode(ARMISD::VMOVSR, DL, VT,
+ DAG.getConstant(INTVal, DL, MVT::i32));
}
}
@@ -6598,10 +6831,9 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
}
// Final sanity check before we try to actually produce a shuffle.
- DEBUG(
- for (auto Src : Sources)
- assert(Src.ShuffleVec.getValueType() == ShuffleVT);
- );
+ LLVM_DEBUG(for (auto Src
+ : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
@@ -7490,39 +7722,15 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
return N0;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getValueType(0);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid code");
- case ISD::ADDC: Opc = ARMISD::ADDC; break;
- case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
- case ISD::SUBC: Opc = ARMISD::SUBC; break;
- case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
- }
-
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2));
-}
-
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
EVT VT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDValue Carry = Op.getOperand(2);
- EVT CarryVT = Carry.getValueType();
SDLoc DL(Op);
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
-
SDValue Result;
if (Op.getOpcode() == ISD::ADDCARRY) {
// This converts the boolean value carry into the carry flag.
@@ -7530,7 +7738,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the addition proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7544,7 +7752,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the subtraction proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7851,7 +8059,7 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- DEBUG(dbgs() << "Lowering node: "; Op.dump());
+ LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
@@ -7879,7 +8087,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
- case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG);
+ case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -7892,7 +8100,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
- case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -7909,10 +8117,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
return LowerUDIV(Op, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::SADDO:
@@ -7927,7 +8131,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
return LowerDYNAMIC_STACKALLOC(Op, DAG);
llvm_unreachable("Don't know how to custom lower this!");
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
@@ -7981,7 +8185,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
ExpandREAD_REGISTER(N, Results, DAG);
break;
case ISD::BITCAST:
- Res = ExpandBITCAST(N, DAG);
+ Res = ExpandBITCAST(N, DAG, Subtarget);
break;
case ISD::SRL:
case ISD::SRA:
@@ -9055,8 +9259,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
MachineOperand Def(MI.getOperand(1));
- if (TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
- Def.setIsRenamable(false);
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
.add(Def) // Rn_wb
.add(MI.getOperand(2)) // Rn
@@ -9323,7 +9525,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
}
-/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
@@ -9860,7 +10062,7 @@ static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
return resNode;
}
-static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Look for multiply add opportunities.
@@ -9877,49 +10079,61 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
// V V
// ADDE <- hiAdd
//
- assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
-
- assert(AddeNode->getNumOperands() == 3 &&
- AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+ // In the special case where only the higher part of a signed result is used
+ // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
+ // a constant with the exact value of 0x80000000, we recognize we are dealing
+ // with a "rounded multiply and add" (or subtract) and transform it into
+ // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
+
+ assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
+ AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
+ "Expect an ADDE or SUBE");
+
+ assert(AddeSubeNode->getNumOperands() == 3 &&
+ AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
"ADDE node has the wrong inputs");
- // Check that we are chained to the right ADDC node.
- SDNode* AddcNode = AddeNode->getOperand(2).getNode();
- if (AddcNode->getOpcode() != ARMISD::ADDC)
+ // Check that we are chained to the right ADDC or SUBC node.
+ SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
+ if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
+ (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
+ AddcSubcNode->getOpcode() != ARMISD::SUBC))
return SDValue();
- SDValue AddcOp0 = AddcNode->getOperand(0);
- SDValue AddcOp1 = AddcNode->getOperand(1);
+ SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
+ SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
// Check if the two operands are from the same mul_lohi node.
- if (AddcOp0.getNode() == AddcOp1.getNode())
+ if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
return SDValue();
- assert(AddcNode->getNumValues() == 2 &&
- AddcNode->getValueType(0) == MVT::i32 &&
+ assert(AddcSubcNode->getNumValues() == 2 &&
+ AddcSubcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
// Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
// maybe a SMLAL which multiplies two 16-bit values.
- if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::SMUL_LOHI)
- return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
+ if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
// Check for the triangle shape.
- SDValue AddeOp0 = AddeNode->getOperand(0);
- SDValue AddeOp1 = AddeNode->getOperand(1);
+ SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
+ SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
- // Make sure that the ADDE operands are not coming from the same node.
- if (AddeOp0.getNode() == AddeOp1.getNode())
+ // Make sure that the ADDE/SUBE operands are not coming from the same node.
+ if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
return SDValue();
- // Find the MUL_LOHI node walking up ADDE's operands.
+ // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
bool IsLeftOperandMUL = false;
- SDValue MULOp = findMUL_LOHI(AddeOp0);
+ SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
if (MULOp == SDValue())
- MULOp = findMUL_LOHI(AddeOp1);
+ MULOp = findMUL_LOHI(AddeSubeOp1);
else
IsLeftOperandMUL = true;
if (MULOp == SDValue())
@@ -9930,63 +10144,88 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
// Figure out the high and low input values to the MLAL node.
- SDValue* HiAdd = nullptr;
- SDValue* LoMul = nullptr;
- SDValue* LowAdd = nullptr;
+ SDValue *HiAddSub = nullptr;
+ SDValue *LoMul = nullptr;
+ SDValue *LowAddSub = nullptr;
- // Ensure that ADDE is from high result of ISD::xMUL_LOHI.
- if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+ // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
+ if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
return SDValue();
if (IsLeftOperandMUL)
- HiAdd = &AddeOp1;
+ HiAddSub = &AddeSubeOp1;
else
- HiAdd = &AddeOp0;
+ HiAddSub = &AddeSubeOp0;
+ // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
+ // whose low result is fed to the ADDC/SUBC we are checking.
- // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
- // whose low result is fed to the ADDC we are checking.
-
- if (AddcOp0 == MULOp.getValue(0)) {
- LoMul = &AddcOp0;
- LowAdd = &AddcOp1;
+ if (AddcSubcOp0 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp0;
+ LowAddSub = &AddcSubcOp1;
}
- if (AddcOp1 == MULOp.getValue(0)) {
- LoMul = &AddcOp1;
- LowAdd = &AddcOp0;
+ if (AddcSubcOp1 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp1;
+ LowAddSub = &AddcSubcOp0;
}
if (!LoMul)
return SDValue();
- // If HiAdd is the same node as ADDC or is a predecessor of ADDC the
- // replacement below will create a cycle.
- if (AddcNode == HiAdd->getNode() ||
- AddcNode->isPredecessorOf(HiAdd->getNode()))
+ // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
+ // the replacement below will create a cycle.
+ if (AddcSubcNode == HiAddSub->getNode() ||
+ AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
return SDValue();
// Create the merged node.
SelectionDAG &DAG = DCI.DAG;
- // Build operand list.
+ // Start building operand list.
SmallVector<SDValue, 8> Ops;
Ops.push_back(LoMul->getOperand(0));
Ops.push_back(LoMul->getOperand(1));
- Ops.push_back(*LowAdd);
- Ops.push_back(*HiAdd);
- SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
+ // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
+ // the case, we must be doing signed multiplication and only use the higher
+ // part of the result of the MLAL, furthermore the LowAddSub must be a constant
+ // addition or subtraction with the value of 0x80000000.
+ if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
+ FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
+ LowAddSub->getNode()->getOpcode() == ISD::Constant &&
+ static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
+ 0x80000000) {
+ Ops.push_back(*HiAddSub);
+ if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
+ FinalOpc = ARMISD::SMMLSR;
+ } else {
+ FinalOpc = ARMISD::SMMLAR;
+ }
+ SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
+
+ return SDValue(AddeSubeNode, 0);
+ } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
+ // SMMLS is generated during instruction selection and the rest of this
+ // function can not handle the case where AddcSubcNode is a SUBC.
+ return SDValue();
+
+ // Finish building the operand list for {U/S}MLAL
+ Ops.push_back(*LowAddSub);
+ Ops.push_back(*HiAddSub);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
DAG.getVTList(MVT::i32, MVT::i32), Ops);
// Replace the ADDs' nodes uses by the MLA node's values.
SDValue HiMLALResult(MLALNode.getNode(), 1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
SDValue LoMLALResult(MLALNode.getNode(), 0);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
- return SDValue(AddeNode, 0);
+ return SDValue(AddeSubeNode, 0);
}
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
@@ -10071,13 +10310,13 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG(DCI.DAG);
- if (N->getOpcode() == ARMISD::ADDC) {
- // (ADDC (ADDE 0, 0, C), -1) -> C
+ if (N->getOpcode() == ARMISD::SUBC) {
+ // (SUBC (ADDE 0, 0, C), 1) -> C
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if (LHS->getOpcode() == ARMISD::ADDE &&
isNullConstant(LHS->getOperand(0)) &&
- isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) {
+ isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
}
}
@@ -10095,12 +10334,15 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
}
}
}
+
return SDValue();
}
-static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformAddeSubeCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
if (Subtarget->isThumb1Only()) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue RHS = N->getOperand(1);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
int64_t imm = C->getSExtValue();
@@ -10118,6 +10360,8 @@ static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
N->getOperand(0), RHS, N->getOperand(2));
}
}
+ } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
+ return AddCombineTo64bitMLAL(N, DCI, Subtarget);
}
return SDValue();
}
@@ -10130,7 +10374,7 @@ static SDValue PerformADDECombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// Only ARM and Thumb2 support UMLAL/SMLAL.
if (Subtarget->isThumb1Only())
- return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ return PerformAddeSubeCombine(N, DCI, Subtarget);
// Only perform the checks after legalize when the pattern is available.
if (DCI.isBeforeLegalize()) return SDValue();
@@ -10201,7 +10445,14 @@ static SDValue PerformSHLSimplify(SDNode *N,
case ISD::XOR:
case ISD::SETCC:
case ARMISD::CMP:
- // Check that its not already using a shl.
+ // Check that the user isn't already using a constant because there
+ // aren't any instructions that support an immediate operand and a
+ // shifted operand.
+ if (isa<ConstantSDNode>(U->getOperand(0)) ||
+ isa<ConstantSDNode>(U->getOperand(1)))
+ return SDValue();
+
+ // Check that it's not already using a shift.
if (U->getOperand(0).getOpcode() == ISD::SHL ||
U->getOperand(1).getOpcode() == ISD::SHL)
return SDValue();
@@ -10223,8 +10474,6 @@ static SDValue PerformSHLSimplify(SDNode *N,
if (!C1ShlC2 || !C2)
return SDValue();
- DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());
-
APInt C2Int = C2->getAPIntValue();
APInt C1Int = C1ShlC2->getAPIntValue();
@@ -10238,12 +10487,12 @@ static SDValue PerformSHLSimplify(SDNode *N,
C1Int.lshrInPlace(C2Int);
// The immediates are encoded as an 8-bit value that can be rotated.
- unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
- if (C1Int.getBitWidth() - Zeros > 8)
- return SDValue();
+ auto LargeImm = [](const APInt &Imm) {
+ unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
+ return Imm.getBitWidth() - Zeros > 8;
+ };
- Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
- if (C2Int.getBitWidth() - Zeros > 8)
+ if (LargeImm(C1Int) || LargeImm(C2Int))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -10254,6 +10503,10 @@ static SDValue PerformSHLSimplify(SDNode *N,
// Shift left to compensate for the lshr of C1Int.
SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+ LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
+ SHL.dump(); N->dump());
+ LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
+
DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
return SDValue(N, 0);
}
@@ -10423,6 +10676,83 @@ static SDValue PerformMULCombine(SDNode *N,
return SDValue();
}
+static SDValue CombineANDShift(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Allow DAGCombine to pattern-match before we touch the canonical form.
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N1C)
+ return SDValue();
+
+ uint32_t C1 = (uint32_t)N1C->getZExtValue();
+ // Don't transform uxtb/uxth.
+ if (C1 == 255 || C1 == 65535)
+ return SDValue();
+
+ SDNode *N0 = N->getOperand(0).getNode();
+ if (!N0->hasOneUse())
+ return SDValue();
+
+ if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
+ return SDValue();
+
+ bool LeftShift = N0->getOpcode() == ISD::SHL;
+
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!N01C)
+ return SDValue();
+
+ uint32_t C2 = (uint32_t)N01C->getZExtValue();
+ if (!C2 || C2 >= 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ // We have a pattern of the form "(and (shl x, c2) c1)" or
+ // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
+ // transform to a pair of shifts, to save materializing c1.
+
+ // First pattern: right shift, and c1+1 is a power of two.
+ // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
+ // of two).
+ // FIXME: Use demanded bits?
+ if (!LeftShift && isMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 < C3) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C3 - C2, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
+ // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
+ // is a power of two).
+ // FIXME: Use demanded bits?
+ if (LeftShift && isShiftedMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C2 + C3, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // FIXME: Transform "(and (shl x, c2) c1)" ->
+ // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
+ // c1.
+ return SDValue();
+}
+
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -10464,6 +10794,10 @@ static SDValue PerformANDCombine(SDNode *N,
return Result;
}
+ if (Subtarget->isThumb1Only())
+ if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
+ return Result;
+
return SDValue();
}
@@ -11012,7 +11346,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
-/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
@@ -11228,6 +11562,12 @@ static SDValue CombineBaseUpdate(SDNode *N,
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
+ // TODO: Support updating VLDxDUP nodes. For now, we just skip
+ // combining base updates for such intrinsics.
+ continue;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
@@ -12306,6 +12646,89 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
}
}
+ if (!VT.isInteger())
+ return SDValue();
+
+ // Materialize a boolean comparison for integers so we can avoid branching.
+ if (isNullConstant(FalseVal)) {
+ if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
+ if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
+ // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
+ // right 5 bits will make that 32 be 1, otherwise it will be 0.
+ // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
+ DAG.getConstant(5, dl, MVT::i32));
+ } else {
+ // CMOV 0, 1, ==, (CMPZ x, y) ->
+ // (ADDCARRY (SUB x, y), t:0, t:1)
+ // where t = (SUBCARRY 0, (SUB x, y), 0)
+ //
+ // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
+ // x != y. In other words, a carry C == 1 when x == y, C == 0
+ // otherwise.
+ // The final ADDCARRY computes
+ // x - y + (0 - (x - y)) + C == C
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
+ // ISD::SUBCARRY returns a borrow but we want the carry here
+ // actually.
+ SDValue Carry =
+ DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
+ Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
+ }
+ } else if (CC == ARMCC::NE && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
+ // This seems pointless but will allow us to combine it further below.
+ // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
+ N->getOperand(3), Cmp);
+ }
+ } else if (isNullConstant(TrueVal)) {
+ if (CC == ARMCC::EQ && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
+ // This seems pointless but will allow us to combine it further below
+ // Note that we change == for != as this is the dual for the case above.
+ // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32),
+ N->getOperand(3), Cmp);
+ }
+ }
+
+ // On Thumb1, the DAG above may be further combined if z is a power of 2
+ // (z == 2 ^ K).
+ // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
+ // merge t3, t4
+ // where t1 = (SUBCARRY (SUB x, y), z, 0)
+ // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
+ // t3 = if K != 0 then (SHL t2:0, K) else t2:0
+ // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
+ const APInt *TrueConst;
+ if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
+ (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
+ (FalseVal.getOperand(1) == RHS) &&
+ (TrueConst = isPowerOf2Constant(TrueVal))) {
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ unsigned ShiftAmount = TrueConst->logBase2();
+ if (ShiftAmount)
+ TrueVal = DAG.getConstant(1, dl, VT);
+ SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
+ Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
+ // Make it a carry, not a borrow.
+ SDValue Carry = DAG.getNode(
+ ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
+ Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
+
+ if (ShiftAmount)
+ Res = DAG.getNode(ISD::SHL, dl, VT, Res,
+ DAG.getConstant(ShiftAmount, dl, MVT::i32));
+ }
+
if (Res.getNode()) {
KnownBits Known;
DAG.computeKnownBits(SDValue(N,0), Known);
@@ -12338,7 +12761,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
- case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -12424,13 +12847,22 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
case Intrinsic::arm_neon_vst4:
@@ -12454,6 +12886,10 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
bool *Fast) const {
+ // Depends what it gets converted into if the type is weird.
+ if (!VT.isSimple())
+ return false;
+
 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
@@ -12560,6 +12996,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool ARMTargetLowering::isFNegFree(EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
+ // negate values directly (fneg is free). So, we don't want to let the DAG
+ // combiner rewrite fneg into xors and some other instructions. For f16 and
+ // FullFP16 argument passing, some bitcast nodes may be introduced,
+ // triggering this DAG combine rewrite, so we are avoiding that with this.
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::f16:
+ return Subtarget->hasFullFP16();
+ }
+
+ return false;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -12828,9 +13282,11 @@ bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
// Thumb2 and ARM modes can use cmn for negative immediates.
if (!Subtarget->isThumb())
- return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
if (Subtarget->isThumb2())
- return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
// Thumb1 doesn't have cmn, and only 8-bit immediates.
return Imm >= 0 && Imm <= 255;
}
@@ -13262,8 +13718,14 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
return RCPair(0U, &ARM::QPR_8RegClass);
break;
case 't':
+ if (VT == MVT::Other)
+ break;
if (VT == MVT::f32 || VT == MVT::i32)
return RCPair(0U, &ARM::SPRRegClass);
+ if (VT.getSizeInBits() == 64)
+ return RCPair(0U, &ARM::DPR_VFP2RegClass);
+ if (VT.getSizeInBits() == 128)
+ return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
}
@@ -13593,6 +14055,20 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
+ SDValue Ops[2] = { SP, Chain };
+ return DAG.getMergeValues(Ops, DL);
+ }
+
SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
DAG.getConstant(2, DL, MVT::i32));
@@ -13656,6 +14132,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
+ if (VT == MVT::f16 && Subtarget->hasFullFP16())
+ return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
@@ -13677,7 +14155,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane: {
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13691,6 +14172,21 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile loads with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::arm_neon_vst1:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
@@ -13717,6 +14213,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile stores with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13768,7 +14285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -14064,7 +14581,7 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
-/// \brief Lower an interleaved load into a vldN intrinsic.
+/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
@@ -14182,7 +14699,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
return true;
}
-/// \brief Lower an interleaved store into a vstN intrinsic.
+/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
@@ -14380,7 +14897,19 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
return (Members > 0 && Members <= 4);
}
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// Return the correct alignment for the current calling convention.
+unsigned
+ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const {
+ if (!ArgTy->isVectorTy())
+ return DL.getABITypeAlignment(ArgTy);
+
+ // Avoid over-aligning vector parameters. It would require realigning the
+ // stack and waste space for no real benefit.
+ return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+}
+
+/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
@@ -14392,7 +14921,7 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
HABaseType Base = HA_UNKNOWN;
uint64_t Members = 0;
bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
- DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+ LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
return IsHA || IsIntArray;