author    Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/ARM/ARMISelLowering.cpp
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
Vendor import of llvm trunk r338150 (tag: vendor/llvm/llvm-trunk-r338150)
Notes:
    svn path=/vendor/llvm/dist/; revision=336809
    svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 871
1 file changed, 700 insertions(+), 171 deletions(-)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index aeda7c06a27a..47222a66f798 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -53,7 +53,6 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -97,6 +96,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -308,13 +308,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setCmpLibcallCC(LC.Op, LC.Cond);
}
}
-
- // Set the correct calling convention for ARMv7k WatchOS. It's just
- // AAPCS_VFP for functions as simple as libcalls.
- if (Subtarget->isTargetWatchABI()) {
- for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i)
- setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP);
- }
}
// These libcalls are not available in 32-bit.
@@ -522,6 +515,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
}
+ if (Subtarget->hasFullFP16()) {
+ addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
+ }
+
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -558,6 +561,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ if (Subtarget->hasFullFP16()) {
+ addQRTypeForNEON(MVT::v8f16);
+ addDRTypeForNEON(MVT::v4f16);
+ }
+
// v2f64 is legal so that QR subregs can be extracted as f64 elements, but
// neither Neon nor VFP support any arithmetic operations on it.
// The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
@@ -820,10 +828,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA, MVT::i64, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
- setOperationAction(ISD::ADDC, MVT::i32, Custom);
- setOperationAction(ISD::ADDE, MVT::i32, Custom);
- setOperationAction(ISD::SUBC, MVT::i32, Custom);
- setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
+ if (Subtarget->isThumb1Only()) {
+ setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+ setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+ }
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -949,7 +959,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
else
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
@@ -1036,13 +1046,18 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+ if (Subtarget->hasFullFP16()) {
+ setOperationAction(ISD::SETCC, MVT::f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::f16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
+ }
- // Thumb-1 cannot currently select ARMISD::SUBE.
- if (!Subtarget->isThumb1Only())
- setOperationAction(ISD::SETCCE, MVT::i32, Custom);
+ setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+ if (Subtarget->hasFullFP16())
+ setOperationAction(ISD::BR_CC, MVT::f16, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
@@ -1121,6 +1136,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasNEON()) {
// vmin and vmax aren't available in a scalar form, so we use
// a NEON instruction with an undef lane instead.
+ setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
@@ -1259,6 +1276,9 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
+ case ARMISD::VMOVhr: return "ARMISD::VMOVhr";
+ case ARMISD::VMOVrh: return "ARMISD::VMOVrh";
+ case ARMISD::VMOVSR: return "ARMISD::VMOVSR";
case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
@@ -1337,6 +1357,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SMLALDX: return "ARMISD::SMLALDX";
case ARMISD::SMLSLD: return "ARMISD::SMLSLD";
case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX";
+ case ARMISD::SMMLAR: return "ARMISD::SMMLAR";
+ case ARMISD::SMMLSR: return "ARMISD::SMMLSR";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
@@ -2465,12 +2487,37 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
+ bool ReturnF16 = false;
+
+ if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
+ // Half-precision return values can be returned like this:
+ //
+ // t11: f16 = fadd ...
+ // t12: i16 = bitcast t11
+ // t13: i32 = zero_extend t12
+ // t14: f32 = bitcast t13 <~~~~~~~ Arg
+ //
+ // to avoid code generation for bitcasts, we simply set Arg to the node
+ // that produces the f16 value, t11 in this case.
+ //
+ if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
+ SDValue ZE = Arg.getOperand(0);
+ if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
+ SDValue BC = ZE.getOperand(0);
+ if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
+ Arg = BC.getOperand(0);
+ ReturnF16 = true;
+ }
+ }
+ }
+ }
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
- Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ if (!ReturnF16)
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
@@ -2518,7 +2565,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(),
+ ReturnF16 ? MVT::f16 : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -2738,7 +2786,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
}
-/// \brief Convert a TLS address reference into the correct sequence of loads
+/// Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address for Darwin, and return an
/// SDValue containing the final node.
@@ -2959,7 +3007,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
SDValue
ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
- if (DAG.getTarget().Options.EmulatedTLS)
+ if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);
if (Subtarget->isTargetDarwin())
@@ -3675,11 +3723,14 @@ SDValue ARMTargetLowering::LowerFormalArguments(
} else {
const TargetRegisterClass *RC;
- if (RegVT == MVT::f32)
+
+ if (RegVT == MVT::f16)
+ RC = &ARM::HPRRegClass;
+ else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
- else if (RegVT == MVT::f64)
+ else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
RC = &ARM::DPRRegClass;
- else if (RegVT == MVT::v2f64)
+ else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
@@ -3799,8 +3850,8 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl) const {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
unsigned C = RHSC->getZExtValue();
- if (!isLegalICmpImmediate(C)) {
- // Constant does not fit, try adjusting it by one?
+ if (!isLegalICmpImmediate((int32_t)C)) {
+ // Constant does not fit, try adjusting it by one.
switch (CC) {
default: break;
case ISD::SETLT:
@@ -3940,6 +3991,29 @@ ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
break;
+ case ISD::UMULO:
+ // We generate a UMUL_LOHI and then check if the high word is 0.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getConstant(0, dl, MVT::i32));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
+ case ISD::SMULO:
+ // We generate a SMUL_LOHI and then check if all the bits of the high word
+ // are the same as the sign bit of the low word.
+ ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
+ Value = DAG.getNode(ISD::SMUL_LOHI, dl,
+ DAG.getVTList(Op.getValueType(), Op.getValueType()),
+ LHS, RHS);
+ OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
+ DAG.getNode(ISD::SRA, dl, Op.getValueType(),
+ Value.getValue(0),
+ DAG.getConstant(31, dl, MVT::i32)));
+ Value = Value.getValue(0); // We only want the low 32 bits for the result.
+ break;
} // switch (...)
return std::make_pair(Value, OverflowCmp);
@@ -3973,11 +4047,12 @@ static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
SDLoc DL(BoolCarry);
EVT CarryVT = BoolCarry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
// This converts the boolean value carry into the carry flag by doing
- // ARMISD::ADDC Carry, ~0
- return DAG.getNode(ARMISD::ADDC, DL, DAG.getVTList(CarryVT, MVT::i32),
- BoolCarry, DAG.getConstant(NegOne, DL, CarryVT));
+ // ARMISD::SUBC Carry, 1
+ SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
+ DAG.getVTList(CarryVT, MVT::i32),
+ BoolCarry, DAG.getConstant(1, DL, CarryVT));
+ return Carry.getValue(1);
}
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
@@ -4313,6 +4388,48 @@ static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
return false;
}
+// Check if a condition of the type x < k ? k : x can be converted into a
+// bit operation instead of conditional moves.
+// Currently this is allowed given:
+// - The conditions and values match up
+// - k is 0 or -1 (all ones)
+// This function will not check the last condition; that's up to the caller.
+// It returns true if the transformation can be made, and in such case
+// returns x in V, and k in SatK.
+static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
+ SDValue &SatK)
+{
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+ SDValue TrueVal = Op.getOperand(2);
+ SDValue FalseVal = Op.getOperand(3);
+
+ SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
+ ? &RHS
+ : nullptr;
+
+ // No constant operation in comparison, early out
+ if (!K)
+ return false;
+
+ SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
+ V = (KTmp == TrueVal) ? FalseVal : TrueVal;
+ SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
+
+ // If the constant on left and right side, or variable on left and right,
+ // does not match, early out
+ if (*K != KTmp || V != VTmp)
+ return false;
+
+ if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
+ SatK = *K;
+ return true;
+ }
+
+ return false;
+}
+
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc dl(Op);
@@ -4331,6 +4448,25 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
}
+ // Try to convert expressions of the form x < k ? k : x (and similar forms)
+ // into more efficient bit operations, which is possible when k is 0 or -1
+ // On ARM and Thumb-2 which have flexible operand 2 this will result in
+ // single instructions. On Thumb the shift and the bit operation will be two
+ // instructions.
+ // Only allow this transformation on full-width (32-bit) operations
+ SDValue LowerSatConstant;
+ if (VT == MVT::i32 &&
+ isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
+ SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
+ DAG.getConstant(31, dl, VT));
+ if (isNullConstant(LowerSatConstant)) {
+ SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
+ DAG.getAllOnesConstant(dl, VT));
+ return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
+ } else if (isAllOnesConstant(LowerSatConstant))
+ return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
+ }
+
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
@@ -4380,9 +4516,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
bool InvalidOnQNaN;
FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
- // Try to generate VMAXNM/VMINNM on ARMv8.
- if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
- TrueVal.getValueType() == MVT::f64)) {
+ // Normalize the fp compare. If RHS is zero we keep it there so we match
+ // CMPFPw0 instead of CMPFP.
+ if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
+ (TrueVal.getValueType() == MVT::f16 ||
+ TrueVal.getValueType() == MVT::f32 ||
+ TrueVal.getValueType() == MVT::f64)) {
bool swpCmpOps = false;
bool swpVselOps = false;
checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
@@ -4532,10 +4671,14 @@ SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = Cond.getOpcode();
- if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO ||
- Opc == ISD::SSUBO || Opc == ISD::USUBO)) {
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
+ if (Cond.getResNo() == 1 &&
+ (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+ Opc == ISD::USUBO || OptimizeMul)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
return SDValue();
@@ -4579,11 +4722,15 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // Optimize {s|u}{add|sub}.with.overflow feeding into a branch instruction.
+ // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+ // instruction.
unsigned Opc = LHS.getOpcode();
+ bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
+ !Subtarget->isThumb1Only();
if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ Opc == ISD::USUBO || OptimizeMul) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
@@ -4614,8 +4761,6 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Chain, Dest, ARMcc, CCR, Cmp);
}
- assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-
if (getTargetMachine().Options.UnsafeFPMath &&
(CC == ISD::SETEQ || CC == ISD::SETOEQ ||
CC == ISD::SETNE || CC == ISD::SETUNE)) {
@@ -4979,7 +5124,8 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
+static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
@@ -4988,8 +5134,78 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
- assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
- "ExpandBITCAST called for non-i64 type");
+ const bool HasFullFP16 = Subtarget->hasFullFP16();
+
+ if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
+ // FullFP16: half values are passed in S-registers, and we don't
+ // need any of the bitcast and moves:
+ //
+ // t2: f32,ch = CopyFromReg t0, Register:f32 %0
+ // t5: i32 = bitcast t2
+ // t18: f16 = ARMISD::VMOVhr t5
+ if (Op.getOpcode() != ISD::CopyFromReg ||
+ Op.getValueType() != MVT::f32)
+ return SDValue();
+
+ auto Move = N->use_begin();
+ if (Move->getOpcode() != ARMISD::VMOVhr)
+ return SDValue();
+
+ SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
+ SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
+ DAG.ReplaceAllUsesWith(*Move, &Copy);
+ return Copy;
+ }
+
+ if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+ if (!HasFullFP16)
+ return SDValue();
+ // SoftFP: read half-precision arguments:
+ //
+ // t2: i32,ch = ...
+ // t7: i16 = truncate t2 <~~~~ Op
+ // t8: f16 = bitcast t7 <~~~~ N
+ //
+ if (Op.getOperand(0).getValueType() == MVT::i32)
+ return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
+ MVT::f16, Op.getOperand(0));
+
+ return SDValue();
+ }
+
+ // Half-precision return values
+ if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+ if (!HasFullFP16)
+ return SDValue();
+ //
+ // t11: f16 = fadd t8, t10
+ // t12: i16 = bitcast t11 <~~~ SDNode N
+ // t13: i32 = zero_extend t12
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
+ // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
+ //
+ // transform this into:
+ //
+ // t20: i32 = ARMISD::VMOVrh t11
+ // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
+ //
+ auto ZeroExtend = N->use_begin();
+ if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
+ ZeroExtend->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ auto Copy = ZeroExtend->use_begin();
+ if (Copy->getOpcode() == ISD::CopyToReg &&
+ Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
+ SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
+ DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
+ return Cvt;
+ }
+ return SDValue();
+ }
+
+ if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+ return SDValue();
// Turn i64->f64 into VMOVDRR.
if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -5566,16 +5782,22 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
return Result;
}
-static SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue Carry = Op.getOperand(2);
SDValue Cond = Op.getOperand(3);
SDLoc DL(Op);
- assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
+ assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
+
+ // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
+ // have to invert the carry first.
+ Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), Carry);
+ // This converts the boolean value carry into the carry flag.
+ Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
- assert(Carry.getOpcode() != ISD::CARRY_FALSE);
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
@@ -5731,23 +5953,34 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *ST) const {
- bool IsDouble = Op.getValueType() == MVT::f64;
+ EVT VT = Op.getValueType();
+ bool IsDouble = (VT == MVT::f64);
ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
const APFloat &FPVal = CFP->getValueAPF();
// Prevent floating-point constants from using literal loads
// when execute-only is enabled.
if (ST->genExecuteOnly()) {
+ // If we can represent the constant as an immediate, don't lower it
+ if (isFPImmLegal(FPVal, VT))
+ return Op;
+ // Otherwise, construct as integer, and move to float register
APInt INTVal = FPVal.bitcastToAPInt();
SDLoc DL(CFP);
- if (IsDouble) {
- SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
- SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
- if (!ST->isLittle())
- std::swap(Lo, Hi);
- return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
- } else {
- return DAG.getConstant(INTVal, DL, MVT::i32);
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unknown floating point type!");
+ break;
+ case MVT::f64: {
+ SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
+ SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
+ if (!ST->isLittle())
+ std::swap(Lo, Hi);
+ return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
+ }
+ case MVT::f32:
+ return DAG.getNode(ARMISD::VMOVSR, DL, VT,
+ DAG.getConstant(INTVal, DL, MVT::i32));
}
}
@@ -6598,10 +6831,9 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
}
// Final sanity check before we try to actually produce a shuffle.
- DEBUG(
- for (auto Src : Sources)
- assert(Src.ShuffleVec.getValueType() == ShuffleVT);
- );
+ LLVM_DEBUG(for (auto Src
+ : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT););
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
@@ -7490,39 +7722,15 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
return N0;
}
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getNode()->getValueType(0);
- SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
- unsigned Opc;
- bool ExtraOp = false;
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Invalid code");
- case ISD::ADDC: Opc = ARMISD::ADDC; break;
- case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
- case ISD::SUBC: Opc = ARMISD::SUBC; break;
- case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
- }
-
- if (!ExtraOp)
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
- Op.getOperand(1), Op.getOperand(2));
-}
-
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
EVT VT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
SDValue Carry = Op.getOperand(2);
- EVT CarryVT = Carry.getValueType();
SDLoc DL(Op);
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
-
SDValue Result;
if (Op.getOpcode() == ISD::ADDCARRY) {
// This converts the boolean value carry into the carry flag.
@@ -7530,7 +7738,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the addition proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7544,7 +7752,7 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Do the subtraction proper using the carry flag we wanted.
Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
- Op.getOperand(1), Carry.getValue(1));
+ Op.getOperand(1), Carry);
// Now convert the carry flag into a boolean value.
Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
@@ -7851,7 +8059,7 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
}
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
- DEBUG(dbgs() << "Lowering node: "; Op.dump());
+ LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
@@ -7879,7 +8087,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
Subtarget);
- case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG);
+ case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
case ISD::SHL:
case ISD::SRL:
case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
@@ -7892,7 +8100,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
case ISD::SETCC: return LowerVSETCC(Op, DAG);
- case ISD::SETCCE: return LowerSETCCE(Op, DAG);
+ case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -7909,10 +8117,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
return LowerDIV_Windows(Op, DAG, /* Signed */ false);
return LowerUDIV(Op, DAG);
- case ISD::ADDC:
- case ISD::ADDE:
- case ISD::SUBC:
- case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADDCARRY:
case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
case ISD::SADDO:
@@ -7927,7 +8131,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SDIVREM:
case ISD::UDIVREM: return LowerDivRem(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
- if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment())
+ if (Subtarget->isTargetWindows())
return LowerDYNAMIC_STACKALLOC(Op, DAG);
llvm_unreachable("Don't know how to custom lower this!");
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
@@ -7981,7 +8185,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
ExpandREAD_REGISTER(N, Results, DAG);
break;
case ISD::BITCAST:
- Res = ExpandBITCAST(N, DAG);
+ Res = ExpandBITCAST(N, DAG, Subtarget);
break;
case ISD::SRL:
case ISD::SRA:
@@ -9055,8 +9259,6 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
// Thumb1 post-indexed loads are really just single-register LDMs.
case ARM::tLDR_postidx: {
MachineOperand Def(MI.getOperand(1));
- if (TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
- Def.setIsRenamable(false);
BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
.add(Def) // Rn_wb
.add(MI.getOperand(2)) // Rn
@@ -9323,7 +9525,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
}
-/// \brief Attaches vregs to MEMCPY that it will use as scratch registers
+/// Attaches vregs to MEMCPY that it will use as scratch registers
/// when it is expanded into LDM/STM. This is done as a post-isel lowering
/// instead of as a custom inserter because we need the use list from the SDNode.
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
@@ -9860,7 +10062,7 @@ static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
return resNode;
}
-static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Look for multiply add opportunities.
@@ -9877,49 +10079,61 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
// V V
// ADDE <- hiAdd
//
- assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
-
- assert(AddeNode->getNumOperands() == 3 &&
- AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+ // In the special case where only the higher part of a signed result is used
+ // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
+ // a constant with the exact value of 0x80000000, we recognize we are dealing
+ // with a "rounded multiply and add" (or subtract) and transform it into
+ // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
+
+ assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
+ AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
+ "Expect an ADDE or SUBE");
+
+ assert(AddeSubeNode->getNumOperands() == 3 &&
+ AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
"ADDE node has the wrong inputs");
- // Check that we are chained to the right ADDC node.
- SDNode* AddcNode = AddeNode->getOperand(2).getNode();
- if (AddcNode->getOpcode() != ARMISD::ADDC)
+ // Check that we are chained to the right ADDC or SUBC node.
+ SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
+ if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
+ (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
+ AddcSubcNode->getOpcode() != ARMISD::SUBC))
return SDValue();
- SDValue AddcOp0 = AddcNode->getOperand(0);
- SDValue AddcOp1 = AddcNode->getOperand(1);
+ SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
+ SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
// Check if the two operands are from the same mul_lohi node.
- if (AddcOp0.getNode() == AddcOp1.getNode())
+ if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
return SDValue();
- assert(AddcNode->getNumValues() == 2 &&
- AddcNode->getValueType(0) == MVT::i32 &&
+ assert(AddcSubcNode->getNumValues() == 2 &&
+ AddcSubcNode->getValueType(0) == MVT::i32 &&
"Expect ADDC with two result values. First: i32");
// Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
// maybe a SMLAL which multiplies two 16-bit values.
- if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
- AddcOp1->getOpcode() != ISD::SMUL_LOHI)
- return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
+ if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
+ AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
// Check for the triangle shape.
- SDValue AddeOp0 = AddeNode->getOperand(0);
- SDValue AddeOp1 = AddeNode->getOperand(1);
+ SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
+ SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
- // Make sure that the ADDE operands are not coming from the same node.
- if (AddeOp0.getNode() == AddeOp1.getNode())
+ // Make sure that the ADDE/SUBE operands are not coming from the same node.
+ if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
return SDValue();
- // Find the MUL_LOHI node walking up ADDE's operands.
+ // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
bool IsLeftOperandMUL = false;
- SDValue MULOp = findMUL_LOHI(AddeOp0);
+ SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
if (MULOp == SDValue())
- MULOp = findMUL_LOHI(AddeOp1);
+ MULOp = findMUL_LOHI(AddeSubeOp1);
else
IsLeftOperandMUL = true;
if (MULOp == SDValue())
@@ -9930,63 +10144,88 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
// Figure out the high and low input values to the MLAL node.
- SDValue* HiAdd = nullptr;
- SDValue* LoMul = nullptr;
- SDValue* LowAdd = nullptr;
+ SDValue *HiAddSub = nullptr;
+ SDValue *LoMul = nullptr;
+ SDValue *LowAddSub = nullptr;
- // Ensure that ADDE is from high result of ISD::xMUL_LOHI.
- if ((AddeOp0 != MULOp.getValue(1)) && (AddeOp1 != MULOp.getValue(1)))
+ // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
+ if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
return SDValue();
if (IsLeftOperandMUL)
- HiAdd = &AddeOp1;
+ HiAddSub = &AddeSubeOp1;
else
- HiAdd = &AddeOp0;
+ HiAddSub = &AddeSubeOp0;
+ // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
+ // whose low result is fed to the ADDC/SUBC we are checking.
- // Ensure that LoMul and LowAdd are taken from correct ISD::SMUL_LOHI node
- // whose low result is fed to the ADDC we are checking.
-
- if (AddcOp0 == MULOp.getValue(0)) {
- LoMul = &AddcOp0;
- LowAdd = &AddcOp1;
+ if (AddcSubcOp0 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp0;
+ LowAddSub = &AddcSubcOp1;
}
- if (AddcOp1 == MULOp.getValue(0)) {
- LoMul = &AddcOp1;
- LowAdd = &AddcOp0;
+ if (AddcSubcOp1 == MULOp.getValue(0)) {
+ LoMul = &AddcSubcOp1;
+ LowAddSub = &AddcSubcOp0;
}
if (!LoMul)
return SDValue();
- // If HiAdd is the same node as ADDC or is a predecessor of ADDC the
- // replacement below will create a cycle.
- if (AddcNode == HiAdd->getNode() ||
- AddcNode->isPredecessorOf(HiAdd->getNode()))
+ // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
+ // the replacement below will create a cycle.
+ if (AddcSubcNode == HiAddSub->getNode() ||
+ AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
return SDValue();
// Create the merged node.
SelectionDAG &DAG = DCI.DAG;
- // Build operand list.
+ // Start building operand list.
SmallVector<SDValue, 8> Ops;
Ops.push_back(LoMul->getOperand(0));
Ops.push_back(LoMul->getOperand(1));
- Ops.push_back(*LowAdd);
- Ops.push_back(*HiAdd);
- SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcNode),
+ // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
+ // the case, we must be doing signed multiplication and only use the higher
+ // part of the result of the MLAL, furthermore the LowAddSub must be a constant
+ // addition or subtraction with the value of 0x80000000.
+ if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
+ FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
+ LowAddSub->getNode()->getOpcode() == ISD::Constant &&
+ static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
+ 0x80000000) {
+ Ops.push_back(*HiAddSub);
+ if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
+ FinalOpc = ARMISD::SMMLSR;
+ } else {
+ FinalOpc = ARMISD::SMMLAR;
+ }
+ SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
+
+ return SDValue(AddeSubeNode, 0);
+ } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
+ // SMMLS is generated during instruction selection and the rest of this
+ // function can not handle the case where AddcSubcNode is a SUBC.
+ return SDValue();
+
+ // Finish building the operand list for {U/S}MLAL
+ Ops.push_back(*LowAddSub);
+ Ops.push_back(*HiAddSub);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
DAG.getVTList(MVT::i32, MVT::i32), Ops);
// Replace the ADDs' nodes uses by the MLA node's values.
SDValue HiMLALResult(MLALNode.getNode(), 1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
SDValue LoMLALResult(MLALNode.getNode(), 0);
- DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
// Return original node to notify the driver to stop replacing.
- return SDValue(AddeNode, 0);
+ return SDValue(AddeSubeNode, 0);
}
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
@@ -10071,13 +10310,13 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG(DCI.DAG);
- if (N->getOpcode() == ARMISD::ADDC) {
- // (ADDC (ADDE 0, 0, C), -1) -> C
+ if (N->getOpcode() == ARMISD::SUBC) {
+ // (SUBC (ADDE 0, 0, C), 1) -> C
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if (LHS->getOpcode() == ARMISD::ADDE &&
isNullConstant(LHS->getOperand(0)) &&
- isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) {
+ isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
}
}
@@ -10095,12 +10334,15 @@ static SDValue PerformAddcSubcCombine(SDNode *N,
}
}
}
+
return SDValue();
}
-static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformAddeSubeCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
if (Subtarget->isThumb1Only()) {
+ SelectionDAG &DAG = DCI.DAG;
SDValue RHS = N->getOperand(1);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
int64_t imm = C->getSExtValue();
@@ -10118,6 +10360,8 @@ static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
N->getOperand(0), RHS, N->getOperand(2));
}
}
+ } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
+ return AddCombineTo64bitMLAL(N, DCI, Subtarget);
}
return SDValue();
}
@@ -10130,7 +10374,7 @@ static SDValue PerformADDECombine(SDNode *N,
const ARMSubtarget *Subtarget) {
// Only ARM and Thumb2 support UMLAL/SMLAL.
if (Subtarget->isThumb1Only())
- return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ return PerformAddeSubeCombine(N, DCI, Subtarget);
// Only perform the checks after legalize when the pattern is available.
if (DCI.isBeforeLegalize()) return SDValue();
@@ -10201,7 +10445,14 @@ static SDValue PerformSHLSimplify(SDNode *N,
case ISD::XOR:
case ISD::SETCC:
case ARMISD::CMP:
- // Check that its not already using a shl.
+ // Check that the user isn't already using a constant because there
+ // aren't any instructions that support an immediate operand and a
+ // shifted operand.
+ if (isa<ConstantSDNode>(U->getOperand(0)) ||
+ isa<ConstantSDNode>(U->getOperand(1)))
+ return SDValue();
+
+ // Check that it's not already using a shift.
if (U->getOperand(0).getOpcode() == ISD::SHL ||
U->getOperand(1).getOpcode() == ISD::SHL)
return SDValue();
@@ -10223,8 +10474,6 @@ static SDValue PerformSHLSimplify(SDNode *N,
if (!C1ShlC2 || !C2)
return SDValue();
- DEBUG(dbgs() << "Trying to simplify shl: "; N->dump());
-
APInt C2Int = C2->getAPIntValue();
APInt C1Int = C1ShlC2->getAPIntValue();
@@ -10238,12 +10487,12 @@ static SDValue PerformSHLSimplify(SDNode *N,
C1Int.lshrInPlace(C2Int);
// The immediates are encoded as an 8-bit value that can be rotated.
- unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
- if (C1Int.getBitWidth() - Zeros > 8)
- return SDValue();
+ auto LargeImm = [](const APInt &Imm) {
+ unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
+ return Imm.getBitWidth() - Zeros > 8;
+ };
- Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
- if (C2Int.getBitWidth() - Zeros > 8)
+ if (LargeImm(C1Int) || LargeImm(C2Int))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -10254,6 +10503,10 @@ static SDValue PerformSHLSimplify(SDNode *N,
// Shift left to compensate for the lshr of C1Int.
SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+ LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
+ SHL.dump(); N->dump());
+ LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
+
DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
return SDValue(N, 0);
}
@@ -10423,6 +10676,83 @@ static SDValue PerformMULCombine(SDNode *N,
return SDValue();
}
+static SDValue CombineANDShift(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Allow DAGCombine to pattern-match before we touch the canonical form.
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::i32)
+ return SDValue();
+
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N1C)
+ return SDValue();
+
+ uint32_t C1 = (uint32_t)N1C->getZExtValue();
+ // Don't transform uxtb/uxth.
+ if (C1 == 255 || C1 == 65535)
+ return SDValue();
+
+ SDNode *N0 = N->getOperand(0).getNode();
+ if (!N0->hasOneUse())
+ return SDValue();
+
+ if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
+ return SDValue();
+
+ bool LeftShift = N0->getOpcode() == ISD::SHL;
+
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!N01C)
+ return SDValue();
+
+ uint32_t C2 = (uint32_t)N01C->getZExtValue();
+ if (!C2 || C2 >= 32)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ // We have a pattern of the form "(and (shl x, c2) c1)" or
+ // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
+ // transform to a pair of shifts, to save materializing c1.
+
+ // First pattern: right shift, and c1+1 is a power of two.
+ // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
+ // of two).
+ // FIXME: Use demanded bits?
+ if (!LeftShift && isMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 < C3) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C3 - C2, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
+ // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
+ // is a power of two).
+ // FIXME: Use demanded bits?
+ if (LeftShift && isShiftedMask_32(C1)) {
+ uint32_t C3 = countLeadingZeros(C1);
+ if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+ DAG.getConstant(C2 + C3, DL, MVT::i32));
+ return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+ DAG.getConstant(C3, DL, MVT::i32));
+ }
+ }
+
+ // FIXME: Transform "(and (shl x, c2) c1)" ->
+ // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
+ // c1.
+ return SDValue();
+}
+
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -10464,6 +10794,10 @@ static SDValue PerformANDCombine(SDNode *N,
return Result;
}
+ if (Subtarget->isThumb1Only())
+ if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
+ return Result;
+
return SDValue();
}
@@ -11012,7 +11346,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, dl, VT, BV);
}
-/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static SDValue
PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
@@ -11228,6 +11562,12 @@ static SDValue CombineBaseUpdate(SDNode *N,
NumVecs = 3; break;
case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD;
NumVecs = 4; break;
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
+ // TODO: Support updating VLDxDUP nodes. For now, we just skip
+ // combining base updates for such intrinsics.
+ continue;
case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
NumVecs = 2; isLaneOp = true; break;
case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
@@ -12306,6 +12646,89 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
}
}
+ if (!VT.isInteger())
+ return SDValue();
+
+ // Materialize a boolean comparison for integers so we can avoid branching.
+ if (isNullConstant(FalseVal)) {
+ if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
+ if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
+ // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
+ // right 5 bits will make that 32 be 1, otherwise it will be 0.
+ // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
+ DAG.getConstant(5, dl, MVT::i32));
+ } else {
+ // CMOV 0, 1, ==, (CMPZ x, y) ->
+ // (ADDCARRY (SUB x, y), t:0, t:1)
+ // where t = (SUBCARRY 0, (SUB x, y), 0)
+ //
+ // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
+ // x != y. In other words, a carry C == 1 when x == y, C == 0
+ // otherwise.
+ // The final ADDCARRY computes
+ // x - y + (0 - (x - y)) + C == C
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
+ // ISD::SUBCARRY returns a borrow but we want the carry here
+ // actually.
+ SDValue Carry =
+ DAG.getNode(ISD::SUB, dl, MVT::i32,
+ DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
+ Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
+ }
+ } else if (CC == ARMCC::NE && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
+ // This seems pointless but will allow us to combine it further below.
+ // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
+ N->getOperand(3), Cmp);
+ }
+ } else if (isNullConstant(TrueVal)) {
+ if (CC == ARMCC::EQ && LHS != RHS &&
+ (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
+ // This seems pointless but will allow us to combine it further below
+ // Note that we change == for != as this is the dual for the case above.
+ // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
+ DAG.getConstant(ARMCC::NE, dl, MVT::i32),
+ N->getOperand(3), Cmp);
+ }
+ }
+
+ // On Thumb1, the DAG above may be further combined if z is a power of 2
+ // (z == 2 ^ K).
+ // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
+ // merge t3, t4
+ // where t1 = (SUBCARRY (SUB x, y), z, 0)
+ // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
+ // t3 = if K != 0 then (SHL t2:0, K) else t2:0
+ // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ]
+ const APInt *TrueConst;
+ if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
+ (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
+ (FalseVal.getOperand(1) == RHS) &&
+ (TrueConst = isPowerOf2Constant(TrueVal))) {
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+ unsigned ShiftAmount = TrueConst->logBase2();
+ if (ShiftAmount)
+ TrueVal = DAG.getConstant(1, dl, VT);
+ SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
+ Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
+ // Make it a carry, not a borrow.
+ SDValue Carry = DAG.getNode(
+ ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
+ Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
+
+ if (ShiftAmount)
+ Res = DAG.getNode(ISD::SHL, dl, VT, Res,
+ DAG.getConstant(ShiftAmount, dl, MVT::i32));
+ }
+
if (Res.getNode()) {
KnownBits Known;
DAG.computeKnownBits(SDValue(N,0), Known);
@@ -12338,7 +12761,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
case ARMISD::ADDC:
case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
- case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -12424,13 +12847,22 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4:
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup:
case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
case Intrinsic::arm_neon_vst4:
@@ -12454,6 +12886,10 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned,
unsigned,
bool *Fast) const {
+ // Depends what it gets converted into if the type is weird.
+ if (!VT.isSimple())
+ return false;
+
 // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
@@ -12560,6 +12996,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool ARMTargetLowering::isFNegFree(EVT VT) const {
+ if (!VT.isSimple())
+ return false;
+
+ // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
+ // negate values directly (fneg is free). So, we don't want to let the DAG
+ // combiner rewrite fneg into xors and some other instructions. For f16 and
+ // FullFP16 argument passing, some bitcast nodes may be introduced,
+ // triggering this DAG combine rewrite, so we are avoiding that with this.
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: break;
+ case MVT::f16:
+ return Subtarget->hasFullFP16();
+ }
+
+ return false;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -12828,9 +13282,11 @@ bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
// Thumb2 and ARM modes can use cmn for negative immediates.
if (!Subtarget->isThumb())
- return ARM_AM::getSOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
if (Subtarget->isThumb2())
- return ARM_AM::getT2SOImmVal(std::abs(Imm)) != -1;
+ return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
+ ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
// Thumb1 doesn't have cmn, and only 8-bit immediates.
return Imm >= 0 && Imm <= 255;
}
@@ -13262,8 +13718,14 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
return RCPair(0U, &ARM::QPR_8RegClass);
break;
case 't':
+ if (VT == MVT::Other)
+ break;
if (VT == MVT::f32 || VT == MVT::i32)
return RCPair(0U, &ARM::SPRRegClass);
+ if (VT.getSizeInBits() == 64)
+ return RCPair(0U, &ARM::DPR_VFP2RegClass);
+ if (VT.getSizeInBits() == 128)
+ return RCPair(0U, &ARM::QPR_VFP2RegClass);
break;
}
}
@@ -13593,6 +14055,20 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
+ Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
+ SDValue Ops[2] = { SP, Chain };
+ return DAG.getMergeValues(Ops, DL);
+ }
+
SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
DAG.getConstant(2, DL, MVT::i32));
@@ -13656,6 +14132,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
+ if (VT == MVT::f16 && Subtarget->hasFullFP16())
+ return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
@@ -13677,7 +14155,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4:
case Intrinsic::arm_neon_vld2lane:
case Intrinsic::arm_neon_vld3lane:
- case Intrinsic::arm_neon_vld4lane: {
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vld2dup:
+ case Intrinsic::arm_neon_vld3dup:
+ case Intrinsic::arm_neon_vld4dup: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13691,6 +14172,21 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOLoad;
return true;
}
+ case Intrinsic::arm_neon_vld1x2:
+ case Intrinsic::arm_neon_vld1x3:
+ case Intrinsic::arm_neon_vld1x4: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile loads with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::arm_neon_vst1:
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
@@ -13717,6 +14213,27 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_neon_vst1x2:
+ case Intrinsic::arm_neon_vst1x3:
+ case Intrinsic::arm_neon_vst1x4: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
+ unsigned NumElts = 0;
+ for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ if (!ArgTy->isVectorTy())
+ break;
+ NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+ }
+ Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = 0;
+ // volatile stores with NEON intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -13768,7 +14285,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-/// \brief Returns true if it is beneficial to convert a load of a constant
+/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
@@ -14064,7 +14581,7 @@ bool ARMTargetLowering::isLegalInterleavedAccessType(
return VecSize == 64 || VecSize % 128 == 0;
}
-/// \brief Lower an interleaved load into a vldN intrinsic.
+/// Lower an interleaved load into a vldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
@@ -14182,7 +14699,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
return true;
}
-/// \brief Lower an interleaved store into a vstN intrinsic.
+/// Lower an interleaved store into a vstN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
@@ -14380,7 +14897,19 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
return (Members > 0 && Members <= 4);
}
-/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
+/// Return the correct alignment for the current calling convention.
+unsigned
+ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
+ DataLayout DL) const {
+ if (!ArgTy->isVectorTy())
+ return DL.getABITypeAlignment(ArgTy);
+
+ // Avoid over-aligning vector parameters. It would require realigning the
+ // stack and waste space for no real benefit.
+ return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment());
+}
+
+/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
/// passing according to AAPCS rules.
bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
@@ -14392,7 +14921,7 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
HABaseType Base = HA_UNKNOWN;
uint64_t Members = 0;
bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
- DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
+ LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
return IsHA || IsIntArray;