Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 2271
1 file changed, 1471 insertions(+), 800 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d9d37638c8a7..85f1e670045b 100644
--- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -24,9 +24,9 @@
 #include "PPCTargetMachine.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
@@ -47,6 +47,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -86,7 +87,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
@@ -96,6 +96,7 @@
 #include <cstdint>
 #include <iterator>
 #include <list>
+#include <optional>
 #include <utility>
 #include <vector>
@@ -121,14 +122,24 @@
 cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
 static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
 cl::desc("use absolute jump tables on ppc"), cl::Hidden);
-static cl::opt<bool> EnableQuadwordAtomics(
-    "ppc-quadword-atomics",
-    cl::desc("enable quadword lock-free atomic operations"), cl::init(false),
-    cl::Hidden);
+static cl::opt<bool>
+    DisablePerfectShuffle("ppc-disable-perfect-shuffle",
+                          cl::desc("disable vector permute decomposition"),
+                          cl::init(true), cl::Hidden);
+
+cl::opt<bool> DisableAutoPairedVecSt(
+    "disable-auto-paired-vec-st",
+    cl::desc("disable automatically generated 32byte paired vector stores"),
+    cl::init(true), cl::Hidden);
+
+static cl::opt<unsigned> PPCMinimumJumpTableEntries(
+    "ppc-min-jump-table-entries", cl::init(64), cl::Hidden,
+    cl::desc("Set minimum number of entries to use a jump table on PPC"));
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumSiblingCalls, "Number of sibling calls");
-STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
+STATISTIC(ShufflesHandledWithVPERM,
+          "Number of shuffles lowered to a VPERM or XXPERM");
 STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocation probed");
 
 static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
@@ -137,6 +148,12 @@
 static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);
 
 static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";
 
+// A faster local-exec TLS access sequence (enabled with the
+// -maix-small-local-exec-tls option) can be produced for TLS variables;
+// consistent with the IBM XL compiler, we apply a max size of slightly under
+// 32KB.
+constexpr uint64_t AIXSmallTlsPolicySizeLimit = 32751;
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
@@ -379,6 +396,24 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
     setOperationAction(ISD::FREM , MVT::f32, Expand);
     setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+    // MASS transformation for LLVM intrinsics with replicating fast-math flag
+    // to be consistent to PPCGenScalarMASSEntries pass
+    if (TM.getOptLevel() == CodeGenOptLevel::Aggressive) {
+      setOperationAction(ISD::FSIN , MVT::f64, Custom);
+      setOperationAction(ISD::FCOS , MVT::f64, Custom);
+      setOperationAction(ISD::FPOW , MVT::f64, Custom);
+      setOperationAction(ISD::FLOG, MVT::f64, Custom);
+      setOperationAction(ISD::FLOG10, MVT::f64, Custom);
+      setOperationAction(ISD::FEXP, MVT::f64, Custom);
+      setOperationAction(ISD::FSIN , MVT::f32, Custom);
+      setOperationAction(ISD::FCOS , MVT::f32, Custom);
+      setOperationAction(ISD::FPOW , MVT::f32, Custom);
+      setOperationAction(ISD::FLOG, MVT::f32, Custom);
+      setOperationAction(ISD::FLOG10, MVT::f32, Custom);
+      setOperationAction(ISD::FEXP, MVT::f32, Custom);
+    }
+
     if (Subtarget.hasSPE()) {
       setOperationAction(ISD::FMA , MVT::f64, Expand);
       setOperationAction(ISD::FMA , MVT::f32, Expand);
@@ -390,7 +425,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   if (Subtarget.hasSPE())
     setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
 
-  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
 
   // If we're enabling GP optimizations, use hardware square root
   if (!Subtarget.hasFSQRT() &&
@@ -423,14 +458,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FROUND, MVT::f32, Legal);
   }
 
-  // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd
-  // to speed up scalar BSWAP64.
+  // Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP
+  // instruction xxbrd to speed up scalar BSWAP64.
+  if (Subtarget.isISA3_1()) {
+    setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+    setOperationAction(ISD::BSWAP, MVT::i64, Legal);
+  } else {
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+    setOperationAction(
+        ISD::BSWAP, MVT::i64,
+        (Subtarget.hasP9Vector() && Subtarget.isPPC64()) ? Custom : Expand);
+  }
+
   // CTPOP or CTTZ were introduced in P8/P9 respectively
-  setOperationAction(ISD::BSWAP, MVT::i32 , Expand);
-  if (Subtarget.hasP9Vector() && Subtarget.isPPC64())
-    setOperationAction(ISD::BSWAP, MVT::i64 , Custom);
-  else
-    setOperationAction(ISD::BSWAP, MVT::i64 , Expand);
   if (Subtarget.isISA3_0()) {
     setOperationAction(ISD::CTTZ , MVT::i32 , Legal);
     setOperationAction(ISD::CTTZ , MVT::i64 , Legal);
@@ -603,6 +643,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);
 
   // To handle counter-based loop conditions.
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
@@ -813,6 +855,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FCEIL, VT, Expand);
       setOperationAction(ISD::FTRUNC, VT, Expand);
       setOperationAction(ISD::FRINT, VT, Expand);
+      setOperationAction(ISD::FLDEXP, VT, Expand);
       setOperationAction(ISD::FNEARBYINT, VT, Expand);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
@@ -1000,7 +1043,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
     setOperationAction(ISD::STORE, MVT::v2f64, Legal);
 
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 
     if (Subtarget.hasP8Vector())
       addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);
@@ -1048,7 +1091,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::STORE, MVT::v2i64, Promote);
     AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
 
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
 
     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
@@ -1133,6 +1176,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setTruncStoreAction(MVT::f128, MVT::f32, Expand);
 
       // No implementation for these ops for PowerPC.
+      setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
       setOperationAction(ISD::FSIN, MVT::f128, Expand);
       setOperationAction(ISD::FCOS, MVT::f128, Expand);
       setOperationAction(ISD::FPOW, MVT::f128, Expand);
@@ -1149,6 +1193,13 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 
+    // Test data class instructions store results in CR bits.
+    if (Subtarget.useCRBits()) {
+      setOperationAction(ISD::IS_FPCLASS, MVT::f32, Custom);
+      setOperationAction(ISD::IS_FPCLASS, MVT::f64, Custom);
+      setOperationAction(ISD::IS_FPCLASS, MVT::f128, Custom);
+    }
+
     // 128 bit shifts can be accomplished via 3 instructions for SHL and
     // SRL, but not for SRA because of the instructions available:
     // VS{RL} and VS{RL}O.
@@ -1263,6 +1314,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
       setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
       setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
+
+      setOperationAction(ISD::ABDU, MVT::v16i8, Legal);
+      setOperationAction(ISD::ABDU, MVT::v8i16, Legal);
+      setOperationAction(ISD::ABDU, MVT::v4i32, Legal);
+      setOperationAction(ISD::ABDS, MVT::v4i32, Legal);
     }
+
+    if (Subtarget.hasP10Vector()) {
+      setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+    }
   }
 
@@ -1272,7 +1332,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::STORE, MVT::v256i1, Custom);
   }
   if (Subtarget.hasMMA()) {
-    addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
+    if (Subtarget.isISAFuture())
+      addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass);
+    else
+      addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
     setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
     setOperationAction(ISD::STORE, MVT::v512i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
@@ -1291,8 +1354,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
   }
 
-  if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics()) {
-    setMaxAtomicSizeInBitsSupported(128);
+  if (shouldInlineQuadwordAtomics()) {
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
     setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
     setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);
@@ -1313,52 +1375,34 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setLibcallName(RTLIB::SRA_I128, nullptr);
     setLibcallName(RTLIB::MUL_I128, nullptr);
     setLibcallName(RTLIB::MULO_I64, nullptr);
-    setLibcallName(RTLIB::MULO_I128, nullptr);
   }
 
-  if (!isPPC64)
+  if (shouldInlineQuadwordAtomics())
+    setMaxAtomicSizeInBitsSupported(128);
+  else if (isPPC64)
+    setMaxAtomicSizeInBitsSupported(64);
+  else
     setMaxAtomicSizeInBitsSupported(32);
 
   setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
 
   // We have target-specific dag combine patterns for the following nodes:
-  setTargetDAGCombine(ISD::ADD);
-  setTargetDAGCombine(ISD::SHL);
-  setTargetDAGCombine(ISD::SRA);
-  setTargetDAGCombine(ISD::SRL);
-  setTargetDAGCombine(ISD::MUL);
-  setTargetDAGCombine(ISD::FMA);
-  setTargetDAGCombine(ISD::SINT_TO_FP);
-  setTargetDAGCombine(ISD::BUILD_VECTOR);
+  setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL,
+                       ISD::MUL, ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});
   if (Subtarget.hasFPCVT())
     setTargetDAGCombine(ISD::UINT_TO_FP);
-  setTargetDAGCombine(ISD::LOAD);
-  setTargetDAGCombine(ISD::STORE);
-  setTargetDAGCombine(ISD::BR_CC);
+  setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
   if (Subtarget.useCRBits())
     setTargetDAGCombine(ISD::BRCOND);
-  setTargetDAGCombine(ISD::BSWAP);
-  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
-  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
-  setTargetDAGCombine(ISD::INTRINSIC_VOID);
-
-  setTargetDAGCombine(ISD::SIGN_EXTEND);
-  setTargetDAGCombine(ISD::ZERO_EXTEND);
-  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,
+                       ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});
 
-  setTargetDAGCombine(ISD::TRUNCATE);
-  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});
+  setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});
 
   if (Subtarget.useCRBits()) {
-    setTargetDAGCombine(ISD::TRUNCATE);
-    setTargetDAGCombine(ISD::SETCC);
-    setTargetDAGCombine(ISD::SELECT_CC);
-  }
-
-  if (Subtarget.hasP9Altivec()) {
-    setTargetDAGCombine(ISD::ABS);
-    setTargetDAGCombine(ISD::VSELECT);
+    setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
   }
 
   setLibcallName(RTLIB::LOG_F128, "logf128");
@@ -1368,6 +1412,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setLibcallName(RTLIB::EXP2_F128, "exp2f128");
   setLibcallName(RTLIB::SIN_F128, "sinf128");
   setLibcallName(RTLIB::COS_F128, "cosf128");
+  setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
   setLibcallName(RTLIB::POW_F128, "powf128");
   setLibcallName(RTLIB::FMIN_F128, "fminf128");
   setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
@@ -1384,6 +1429,14 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
   setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
   setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
   setLibcallName(RTLIB::FMA_F128, "fmaf128");
+  setLibcallName(RTLIB::FREXP_F128, "frexpf128");
+
+  if (Subtarget.isAIXABI()) {
+    setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
+    setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");
+    setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");
+    setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");
+  }
 
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
@@ -1392,6 +1445,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setJumpIsExpensive();
   }
 
+  // TODO: The default entry number is set to 64. This stops most jump table
+  // generation on PPC. But it is good for current PPC HWs because the indirect
+  // branch instruction mtctr to the jump table may lead to bad branch predict.
+  // Re-evaluate this value on future HWs that can do better with mtctr.
+  setMinimumJumpTableEntries(PPCMinimumJumpTableEntries);
+
   setMinFunctionAlignment(Align(4));
 
   switch (Subtarget.getCPUDirective()) {
@@ -1539,9 +1598,9 @@ static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
     return;
   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
     if (MaxMaxAlign >= 32 &&
-        VTy->getPrimitiveSizeInBits().getFixedSize() >= 256)
+        VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)
       MaxAlign = Align(32);
-    else if (VTy->getPrimitiveSizeInBits().getFixedSize() >= 128 &&
+    else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&
              MaxAlign < 16)
       MaxAlign = Align(16);
   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
@@ -1585,12 +1644,33 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   return VT.isScalarInteger();
 }
 
+bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore(
+    Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+  if (!Subtarget.isPPC64() || !Subtarget.hasVSX())
+    return false;
+
+  if (auto *VTy = dyn_cast<VectorType>(VectorTy)) {
+    if (VTy->getScalarType()->isIntegerTy()) {
+      // ElemSizeInBits 8/16 can fit in immediate field, not needed here.
+      if (ElemSizeInBits == 32) {
+        Index = Subtarget.isLittleEndian() ? 2 : 1;
+        return true;
+      }
+      if (ElemSizeInBits == 64) {
+        Index = Subtarget.isLittleEndian() ? 1 : 0;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER: break;
   case PPCISD::FSEL: return "PPCISD::FSEL";
-  case PPCISD::XSMAXCDP: return "PPCISD::XSMAXCDP";
-  case PPCISD::XSMINCDP: return "PPCISD::XSMINCDP";
+  case PPCISD::XSMAXC: return "PPCISD::XSMAXC";
+  case PPCISD::XSMINC: return "PPCISD::XSMINC";
   case PPCISD::FCFID: return "PPCISD::FCFID";
   case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
   case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
@@ -1599,10 +1679,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
   case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
   case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
-  case PPCISD::FP_TO_UINT_IN_VSR:
-    return "PPCISD::FP_TO_UINT_IN_VSR,";
-  case PPCISD::FP_TO_SINT_IN_VSR:
-    return "PPCISD::FP_TO_SINT_IN_VSR";
   case PPCISD::FRE: return "PPCISD::FRE";
   case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
   case PPCISD::FTSQRT:
@@ -1618,6 +1694,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "PPCISD::XXSPLTI32DX";
   case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
   case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
+  case PPCISD::XXPERM:
+    return "PPCISD::XXPERM";
   case PPCISD::VECSHL: return "PPCISD::VECSHL";
   case PPCISD::CMPB: return "PPCISD::CMPB";
   case PPCISD::Hi: return "PPCISD::Hi";
@@ -1649,7 +1727,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "PPCISD::BCTRL_RM";
   case PPCISD::BCTRL_LOAD_TOC_RM:
     return "PPCISD::BCTRL_LOAD_TOC_RM";
-  case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
+  case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE";
   case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
   case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
   case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
@@ -1696,6 +1774,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
   case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
"PPCISD::GET_TLS_ADDR"; + case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER"; case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX"; case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; @@ -1713,7 +1792,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; - case PPCISD::VABSD: return "PPCISD::VABSD"; case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; @@ -1752,6 +1830,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::STRICT_FCFIDUS: return "PPCISD::STRICT_FCFIDUS"; case PPCISD::LXVRZX: return "PPCISD::LXVRZX"; + case PPCISD::STORE_COND: + return "PPCISD::STORE_COND"; } return nullptr; } @@ -1868,8 +1948,7 @@ bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, /// For the latter, the input operands are swapped (see PPCInstrAltivec.td). bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind, SelectionDAG &DAG) { - const PPCSubtarget& Subtarget = - static_cast<const PPCSubtarget&>(DAG.getSubtarget()); + const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>(); if (!Subtarget.hasP8Vector()) return false; @@ -2123,7 +2202,11 @@ int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, /// specifies a splat of a single element that is suitable for input to /// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.). bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { - assert(N->getValueType(0) == MVT::v16i8 && isPowerOf2_32(EltSize) && + EVT VT = N->getValueType(0); + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return EltSize == 8 && N->getMaskElt(0) == N->getMaskElt(1); + + assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) && EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes"); // The consecutive indices need to specify an element, not part of two @@ -2424,6 +2507,12 @@ unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); assert(isSplatShuffleMask(SVOp, EltSize)); + EVT VT = SVOp->getValueType(0); + + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return DAG.getDataLayout().isLittleEndian() ? 
+    return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(0)
+                                                : SVOp->getMaskElt(0);
+
   if (DAG.getDataLayout().isLittleEndian())
     return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
   else
@@ -2477,7 +2566,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
     if (LeadingZero) {
       if (!UniquedVals[Multiple-1].getNode())
         return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
-      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+      int Val = UniquedVals[Multiple - 1]->getAsZExtVal();
      if (Val < 16) // 0,0,0,4 -> vspltisw(4)
        return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
     }
@@ -2509,7 +2598,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
     Value = CN->getZExtValue();
   } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
     assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
-    Value = FloatToBits(CN->getValueAPF().convertToFloat());
+    Value = llvm::bit_cast<uint32_t>(CN->getValueAPF().convertToFloat());
   }
 
   // If the splat value is larger than the element value, then we can never do
@@ -2546,11 +2635,11 @@ bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
   if (!isa<ConstantSDNode>(N))
     return false;
 
-  Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
+  Imm = (int16_t)N->getAsZExtVal();
   if (N->getValueType(0) == MVT::i32)
-    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+    return Imm == (int32_t)N->getAsZExtVal();
   else
-    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+    return Imm == (int64_t)N->getAsZExtVal();
 }
 bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
   return isIntS16Immediate(Op.getNode(), Imm);
@@ -2595,7 +2684,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
   if (!isa<ConstantSDNode>(N))
     return false;
 
-  Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+  Imm = (int64_t)N->getAsZExtVal();
   return isInt<34>(Imm);
 }
 bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
@@ -2728,8 +2817,8 @@ bool PPCTargetLowering::SelectAddressRegImm(
       return true; // [r+i]
     } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
       // Match LOAD (ADD (X, Lo(G))).
-      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
-             && "Cannot handle constant offsets yet!");
+      assert(!N.getOperand(1).getConstantOperandVal(1) &&
+             "Cannot handle constant offsets yet!");
       Disp = N.getOperand(1).getOperand(0); // The global address.
       assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
              Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
@@ -2885,7 +2974,7 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
 
 template <typename Ty> static bool isValidPCRelNode(SDValue N) {
   Ty *PCRelCand = dyn_cast<Ty>(N);
-  return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
+  return PCRelCand && (PPCInstrInfo::hasPCRelFlag(PCRelCand->getTargetFlags()));
 }
 
 /// Returns true if this address is a PC Relative address.
@@ -2960,15 +3049,15 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
   bool isLoad = true;
   SDValue Ptr;
   EVT VT;
-  unsigned Alignment;
+  Align Alignment;
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
     Ptr = LD->getBasePtr();
     VT = LD->getMemoryVT();
-    Alignment = LD->getAlignment();
+    Alignment = LD->getAlign();
   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     Ptr = ST->getBasePtr();
     VT = ST->getMemoryVT();
-    Alignment = ST->getAlignment();
+    Alignment = ST->getAlign();
     isLoad = false;
   } else
     return false;
@@ -3008,11 +3097,11 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
 
   // LDU/STU can only handle immediates that are a multiple of 4.
   if (VT != MVT::i64) {
-    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, None))
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, std::nullopt))
       return false;
   } else {
     // LDU/STU need an address with at least 4-byte alignment.
-    if (Alignment < 4)
+    if (Alignment < Align(4))
      return false;
 
     if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
@@ -3046,8 +3135,8 @@ static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
 
   // Don't use the pic base if not in PIC relocation model.
   if (IsPIC) {
-    HiOpFlags |= PPCII::MO_PIC_FLAG;
-    LoOpFlags |= PPCII::MO_PIC_FLAG;
+    HiOpFlags = PPCII::MO_PIC_HA_FLAG;
+    LoOpFlags = PPCII::MO_PIC_LO_FLAG;
   }
 }
@@ -3090,7 +3179,7 @@ SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
   SDValue Ops[] = { GA, Reg };
   return DAG.getMemIntrinsicNode(
       PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
-      MachinePointerInfo::getGOT(DAG.getMachineFunction()), None,
+      MachinePointerInfo::getGOT(DAG.getMachineFunction()), std::nullopt,
       MachineMemOperand::MOLoad);
 }
@@ -3274,9 +3363,61 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
   SDLoc dl(GA);
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
+  bool Is64Bit = Subtarget.isPPC64();
+  bool HasAIXSmallLocalExecTLS = Subtarget.hasAIXSmallLocalExecTLS();
+  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
+  bool IsTLSLocalExecModel = Model == TLSModel::LocalExec;
+
+  if (IsTLSLocalExecModel || Model == TLSModel::InitialExec) {
+    SDValue VariableOffsetTGA =
+        DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
+    SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
+    SDValue TLSReg;
+    if (Is64Bit) {
+      // For local-exec and initial-exec on AIX (64-bit), the sequence generated
+      // involves a load of the variable offset (from the TOC), followed by an
+      // add of the loaded variable offset to R13 (the thread pointer).
+      // This code sequence looks like:
+      //    ld reg1,var[TC](2)
+      //    add reg2, reg1, r13     // r13 contains the thread pointer
+      TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
+
+      // With the -maix-small-local-exec-tls option, produce a faster access
+      // sequence for local-exec TLS variables where the offset from the TLS
+      // base is encoded as an immediate operand.
+      //
+      // We only utilize the faster local-exec access sequence when the TLS
+      // variable has a size within the policy limit. We treat types that are
+      // not sized or are empty as being over the policy size limit.
+      if (HasAIXSmallLocalExecTLS && IsTLSLocalExecModel) {
+        Type *GVType = GV->getValueType();
+        if (GVType->isSized() && !GVType->isEmptyTy() &&
+            GV->getParent()->getDataLayout().getTypeAllocSize(GVType) <=
+                AIXSmallTlsPolicySizeLimit)
+          return DAG.getNode(PPCISD::Lo, dl, PtrVT, VariableOffsetTGA, TLSReg);
+      }
+    } else {
+      // For local-exec and initial-exec on AIX (32-bit), the sequence generated
+      // involves loading the variable offset from the TOC, generating a call to
+      // .__get_tpointer to get the thread pointer (which will be in R3), and
+      // adding the two together:
+      //    lwz reg1,var[TC](2)
+      //    bla .__get_tpointer
+      //    add reg2, reg1, r3
+      TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);
+
+      // We do not implement the 32-bit version of the faster access sequence
+      // for local-exec that is controlled by -maix-small-local-exec-tls.
+      if (HasAIXSmallLocalExecTLS)
+        report_fatal_error("The small-local-exec TLS access sequence is "
+                           "currently only supported on AIX (64-bit mode).");
+    }
+    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);
+  }
 
-  // The general-dynamic model is the only access model supported for now, so
-  // all the GlobalTLSAddress nodes are lowered with this model.
+  // Only Local-Exec, Initial-Exec and General-Dynamic TLS models are currently
+  // supported models. If Local- or Initial-exec are not possible or specified,
+  // all GlobalTLSAddress nodes are lowered using the general-dynamic model.
   // We need to generate two TOC entries, one for the variable offset, one for
   // the region handle. The global address for the TOC entry of the region
   // handle is created with the MO_TLSGDM_FLAG flag and the global address
@@ -3314,8 +3455,8 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
   if (Model == TLSModel::LocalExec) {
     if (Subtarget.isUsingPCRelativeCalls()) {
       SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
-      SDValue TGA = DAG.getTargetGlobalAddress(
-          GV, dl, PtrVT, 0, (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG));
+      SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                               PPCII::MO_TPREL_PCREL_FLAG);
       SDValue MatAddr =
           DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA);
       return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr);
@@ -3337,8 +3478,7 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
     SDValue TGA = DAG.getTargetGlobalAddress(
         GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
     SDValue TGATLS = DAG.getTargetGlobalAddress(
-        GV, dl, PtrVT, 0,
-        IsPCRel ? (PPCII::MO_TLS | PPCII::MO_PCREL_FLAG) : PPCII::MO_TLS);
+        GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_TLS_PCREL_FLAG : PPCII::MO_TLS);
     SDValue TPOffset;
     if (IsPCRel) {
       SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, TGA);
@@ -3434,8 +3574,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
   EVT Ty = getPointerTy(DAG.getDataLayout());
   if (isAccessedAsGotIndirect(Op)) {
     SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
-                                            PPCII::MO_PCREL_FLAG |
-                                                PPCII::MO_GOT_FLAG);
+                                            PPCII::MO_GOT_PCREL_FLAG);
     SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
     SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
                                MachinePointerInfo());
@@ -3685,21 +3824,21 @@ SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
 
   // Check all operands that may contain the LR.
   for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
-    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
-    unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+    const InlineAsm::Flag Flags(Op.getConstantOperandVal(i));
+    unsigned NumVals = Flags.getNumOperandRegisters();
     ++i; // Skip the ID value.
 
-    switch (InlineAsm::getKind(Flags)) {
+    switch (Flags.getKind()) {
     default:
       llvm_unreachable("Bad flags!");
-    case InlineAsm::Kind_RegUse:
-    case InlineAsm::Kind_Imm:
-    case InlineAsm::Kind_Mem:
+    case InlineAsm::Kind::RegUse:
+    case InlineAsm::Kind::Imm:
+    case InlineAsm::Kind::Mem:
       i += NumVals;
       break;
-    case InlineAsm::Kind_Clobber:
-    case InlineAsm::Kind_RegDef:
-    case InlineAsm::Kind_RegDefEarlyClobber: {
+    case InlineAsm::Kind::Clobber:
+    case InlineAsm::Kind::RegDef:
+    case InlineAsm::Kind::RegDefEarlyClobber: {
       for (; NumVals; --NumVals, ++i) {
         Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
         if (Reg != PPC::LR && Reg != PPC::LR8)
@@ -4126,12 +4265,12 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
                  ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
-  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
+  CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);
 
   CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
 
   // Area that is at least reserved in the caller of this function.
-  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
+  unsigned MinReservedArea = CCByValInfo.getStackSize();
   MinReservedArea = std::max(MinReservedArea, LinkageSize);
 
   // Set the size that is at least reserved in caller of this function. Tail
@@ -4151,13 +4290,13 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
       PPC::R3, PPC::R4, PPC::R5, PPC::R6,
       PPC::R7, PPC::R8, PPC::R9, PPC::R10,
     };
-    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
+    const unsigned NumGPArgRegs = std::size(GPArgRegs);
 
     static const MCPhysReg FPArgRegs[] = {
       PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
-    unsigned NumFPArgRegs = array_lengthof(FPArgRegs);
+    unsigned NumFPArgRegs = std::size(FPArgRegs);
 
     if (useSoftFloat() || hasSPE())
       NumFPArgRegs = 0;
@@ -4169,9 +4308,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
     int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                 NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;
 
-    FuncInfo->setVarArgsStackOffset(
-      MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
-                            CCInfo.getNextStackOffset(), true));
+    FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(
+        PtrVT.getSizeInBits() / 8, CCInfo.getStackSize(), true));
 
     FuncInfo->setVarArgsFrameIndex(
         MFI.CreateStackObject(Depth, Align(8), false));
@@ -4269,9 +4407,9 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
 
-  const unsigned Num_GPR_Regs = array_lengthof(GPR);
+  const unsigned Num_GPR_Regs = std::size(GPR);
   const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
-  const unsigned Num_VR_Regs = array_lengthof(VR);
+  const unsigned Num_VR_Regs = std::size(VR);
 
   // Do a first pass over the arguments to determine whether the ABI
   // guarantees that our caller has allocated the parameter save area
@@ -4419,8 +4557,11 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
           SDValue Off = DAG.getConstant(j, dl, PtrVT);
           Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
         }
-        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr,
-                                     MachinePointerInfo(&*FuncArg, j));
+        unsigned StoreSizeInBits = std::min(PtrByteSize, (ObjSize - j)) * 8;
+        EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), StoreSizeInBits);
+        SDValue Store =
+            DAG.getTruncStore(Val.getValue(1), dl, Val, Addr,
+                              MachinePointerInfo(&*FuncArg, j), ObjType);
         MemOps.push_back(Store);
         ++GPR_idx;
       }
@@ -4628,9 +4769,10 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
   return SPDiff;
 }
 
-static bool isFunctionGlobalAddress(SDValue Callee);
+static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);
 
-static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
+static bool callsShareTOCBase(const Function *Caller,
+                              const GlobalValue *CalleeGV,
                               const TargetMachine &TM) {
   // It does not make sense to call callsShareTOCBase() with a caller that
   // is PC Relative since PC Relative callers do not have a TOC.
@@ -4644,23 +4786,20 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
   // don't have enough information to determine if the caller and callee share
   // the same TOC base, so we have to pessimistically assume they don't for
   // correctness.
-  GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
-  if (!G)
+  if (!CalleeGV)
     return false;
 
-  const GlobalValue *GV = G->getGlobal();
-
   // If the callee is preemptable, then the static linker will use a plt-stub
   // which saves the toc to the stack, and needs a nop after the call
   // instruction to convert to a toc-restore.
-  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
+  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), CalleeGV))
     return false;
 
   // Functions with PC Relative enabled may clobber the TOC in the same DSO.
   // We may need a TOC restore in the situation where the caller requires a
   // valid TOC but the callee is PC Relative and does not.
-  const Function *F = dyn_cast<Function>(GV);
-  const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
+  const Function *F = dyn_cast<Function>(CalleeGV);
+  const GlobalAlias *Alias = dyn_cast<GlobalAlias>(CalleeGV);
 
   // If we have an Alias we can try to get the function from there.
   if (Alias) {
@@ -4685,7 +4824,7 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
   // replaced by another function at link time. The function that replaces
   // it may not share the same TOC as the caller since the callee may be
   // replaced by a PC Relative version of the same function.
-  if (!GV->isStrongDefinitionForLinker())
+  if (!CalleeGV->isStrongDefinitionForLinker())
     return false;
 
   // The medium and large code models are expected to provide a sufficiently
@@ -4698,10 +4837,10 @@ static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
   // Any explicitly-specified sections and section prefixes must also match.
   // Also, if we're using -ffunction-sections, then each function is always in
   // a different section (the same is true for COMDAT functions).
-  if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
-      GV->getSection() != Caller->getSection())
+  if (TM.getFunctionSections() || CalleeGV->hasComdat() ||
+      Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())
     return false;
-  if (const auto *F = dyn_cast<Function>(GV)) {
+  if (const auto *F = dyn_cast<Function>(CalleeGV)) {
     if (F->getSectionPrefix() != Caller->getSectionPrefix())
       return false;
   }
@@ -4726,9 +4865,9 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
 
-  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumGPRs = std::size(GPR);
   const unsigned NumFPRs = 13;
-  const unsigned NumVRs = array_lengthof(VR);
+  const unsigned NumVRs = std::size(VR);
   const unsigned ParamAreaSize = NumGPRs * PtrByteSize;
 
   unsigned NumBytes = LinkageSize;
@@ -4794,9 +4933,11 @@ areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
 }
 
 bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
-    SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB, bool isVarArg,
+    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
+    CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+    const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
+    bool isCalleeExternalSymbol) const {
   bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
 
   if (DisableSCO && !TailCallOpt) return false;
@@ -4804,9 +4945,8 @@ bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
   // Variadic argument functions are not supported.
   if (isVarArg) return false;
 
-  auto &Caller = DAG.getMachineFunction().getFunction();
   // Check that the calling conventions are compatible for tco.
-  if (!areCallingConvEligibleForTCO_64SVR4(Caller.getCallingConv(), CalleeCC))
+  if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
     return false;
 
   // Caller contains any byval parameter is not supported.
@@ -4834,8 +4974,7 @@ bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
 
   // If callee and caller use different calling conventions, we cannot pass
   // parameters on stack since offsets for the parameter area may be different.
-  if (Caller.getCallingConv() != CalleeCC &&
-      needStackSlotPassParameters(Subtarget, Outs))
+  if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
     return false;
 
   // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
@@ -4848,12 +4987,12 @@ bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
   // applicable so this check is not required.
   // Check first for indirect calls.
   if (!Subtarget.isUsingPCRelativeCalls() &&
-      !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee))
+      !isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
    return false;
 
   // Check if we share the TOC base.
   if (!Subtarget.isUsingPCRelativeCalls() &&
-      !callsShareTOCBase(&Caller, Callee, getTargetMachine()))
+      !callsShareTOCBase(CallerFunc, CalleeGV, getTargetMachine()))
     return false;
 
   // TCO allows altering callee ABI, so we don't have to check further.
@@ -4868,7 +5007,7 @@ bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
   // PC Relative tail calls may not have a CallBase.
   // If there is no CallBase we cannot verify if we have the same argument
   // list so assume that we don't have the same argument list.
-  if (CB && !hasSameArgumentList(&Caller, *CB) &&
+  if (CB && !hasSameArgumentList(CallerFunc, *CB) &&
       needStackSlotPassParameters(Subtarget, Outs))
     return false;
   else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
@@ -4880,12 +5019,10 @@ bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
 /// for tail call optimization. Targets which want to do tail call
 /// optimization should implement this function.
-bool
-PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
-                                                     CallingConv::ID CalleeCC,
-                                                     bool isVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                                     SelectionDAG& DAG) const {
+bool PPCTargetLowering::IsEligibleForTailCallOptimization(
+    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
+    CallingConv::ID CallerCC, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins) const {
   if (!getTargetMachine().Options.GuaranteedTailCallOpt)
     return false;
 
@@ -4893,14 +5030,10 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (isVarArg)
     return false;
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
   if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
     // Functions containing by val parameters are not supported.
-    for (unsigned i = 0; i != Ins.size(); i++) {
-       ISD::ArgFlagsTy Flags = Ins[i].Flags;
-       if (Flags.isByVal()) return false;
-    }
+    if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
+      return false;
 
     // Non-PIC/GOT tail calls are supported.
     if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
@@ -4908,9 +5041,9 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 
     // At the moment we can only do local tail calls (in same module, hidden
     // or protected) if we are generating PIC.
-    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-      return G->getGlobal()->hasHiddenVisibility()
-          || G->getGlobal()->hasProtectedVisibility();
+    if (CalleeGV)
+      return CalleeGV->hasHiddenVisibility() ||
+             CalleeGV->hasProtectedVisibility();
   }
 
   return false;
@@ -5060,7 +5193,7 @@ static void LowerMemOpCallTo(
 }
 
 static void
-PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,
                 const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
                 SDValue FPOp,
                 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
@@ -5068,7 +5201,7 @@ PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
   // might overwrite each other in case of tail call optimization.
   SmallVector<SDValue, 8> MemOpChains2;
   // Do not flag preceding copytoreg stuff together with the following stuff.
-  InFlag = SDValue();
+  InGlue = SDValue();
   StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                     MemOpChains2, dl);
   if (!MemOpChains2.empty())
@@ -5078,27 +5211,25 @@ PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
   Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);
 
   // Emit callseq_end just before tailcall node.
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
-                             DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
-  InFlag = Chain.getValue(1);
+  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, dl);
+  InGlue = Chain.getValue(1);
 }
 
 // Is this global address that of a function that can be called by name? (as
 // opposed to something that must hold a descriptor for an indirect call).
-static bool isFunctionGlobalAddress(SDValue Callee) {
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    if (Callee.getOpcode() == ISD::GlobalTLSAddress ||
-        Callee.getOpcode() == ISD::TargetGlobalTLSAddress)
+static bool isFunctionGlobalAddress(const GlobalValue *GV) {
+  if (GV) {
+    if (GV->isThreadLocal())
       return false;
 
-    return G->getGlobal()->getValueType()->isFunctionTy();
+    return GV->getValueType()->isFunctionTy();
   }
 
   return false;
 }
 
 SDValue PPCTargetLowering::LowerCallResult(
-    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   SmallVector<CCValAssign, 16> RVLocs;
@@ -5119,22 +5250,22 @@ SDValue PPCTargetLowering::LowerCallResult(
 
     if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
       SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
-                                      InFlag);
+                                      InGlue);
       Chain = Lo.getValue(1);
-      InFlag = Lo.getValue(2);
+      InGlue = Lo.getValue(2);
       VA = RVLocs[++i]; // skip ahead to next loc
       SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
-                                      InFlag);
+                                      InGlue);
       Chain = Hi.getValue(1);
-      InFlag = Hi.getValue(2);
+      InGlue = Hi.getValue(2);
       if (!Subtarget.isLittleEndian())
         std::swap (Lo, Hi);
       Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
     } else {
       Val = DAG.getCopyFromReg(Chain, dl,
-                               VA.getLocReg(), VA.getLocVT(), InFlag);
+                               VA.getLocReg(), VA.getLocVT(), InGlue);
       Chain = Val.getValue(1);
-      InFlag = Val.getValue(2);
+      InGlue = Val.getValue(2);
     }
 
     switch (VA.getLocInfo()) {
@@ -5163,11 +5294,14 @@ SDValue PPCTargetLowering::LowerCallResult(
 
 static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
                            const PPCSubtarget &Subtarget, bool isPatchPoint) {
+  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = G ? G->getGlobal() : nullptr;
+
   // PatchPoint calls are not indirect.
   if (isPatchPoint)
     return false;
 
-  if (isFunctionGlobalAddress(Callee) || isa<ExternalSymbolSDNode>(Callee))
+  if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Callee))
     return false;
 
   // Darwin, and 32-bit ELF can use a BLA. The descriptor based ABIs can not
@@ -5204,7 +5338,7 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
     // inserted into the DAG as part of call lowering. The restore of the TOC
     // pointer is modeled by using a pseudo instruction for the call opcode that
     // represents the 2 instruction sequence of an indirect branch and link,
-    // immediately followed by a load of the TOC pointer from the the stack save
+    // immediately followed by a load of the TOC pointer from the stack save
     // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
     // as it is not saved or used.
     RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
                                                  : PPCISD::BCTRL;
@@ -5212,7 +5346,7 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
   } else if (Subtarget.isUsingPCRelativeCalls()) {
     assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
     RetOpc = PPCISD::CALL_NOTOC;
-  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
+  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
     // The ABIs that maintain a TOC pointer accross calls need to have a nop
     // immediately following the call instruction if the caller and callee may
    // have different TOC bases. At link time if the linker determines the calls
@@ -5221,9 +5355,11 @@ static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
     // TOC pointer at an ABI designated offset in the linkage area and the
     // linker will rewrite the nop to be a load of the TOC pointer from the
     // linkage area into gpr2.
-    RetOpc = callsShareTOCBase(&Caller, Callee, TM) ? PPCISD::CALL
-                                                    : PPCISD::CALL_NOP;
-  else
+    auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
+    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
+    RetOpc =
+        callsShareTOCBase(&Caller, GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
+  } else
     RetOpc = PPCISD::CALL;
   if (IsStrictFPCall) {
     switch (RetOpc) {
@@ -5283,7 +5419,9 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
     return DAG.getMCSymbol(S, PtrVT);
   };
 
-  if (isFunctionGlobalAddress(Callee)) {
+  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = G ? G->getGlobal() : nullptr;
+  if (isFunctionGlobalAddress(GV)) {
     const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
 
     if (Subtarget.isAIXABI()) {
@@ -5348,8 +5486,8 @@ static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                 const SDLoc &dl) {
   SDValue MTCTROps[] = {Chain, Callee, Glue};
   EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
-  Chain = DAG.getNode(PPCISD::MTCTR, dl, makeArrayRef(ReturnTypes, 2),
-                      makeArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
+  Chain = DAG.getNode(PPCISD::MTCTR, dl, ArrayRef(ReturnTypes, 2),
+                      ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
   // The glue is the second value produced.
   Glue = Chain.getValue(1);
@@ -5404,7 +5542,7 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
   const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();
 
   const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
-  const unsigned Alignment = Subtarget.isPPC64() ? 8 : 4;
+  const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
 
   // One load for the functions entry point address.
   SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
@@ -5574,7 +5712,9 @@ SDValue PPCTargetLowering::FinishCall(
     assert(CallOpc == PPCISD::TC_RETURN &&
            "Unexpected call opcode for a tail call.");
     DAG.getMachineFunction().getFrameInfo().setHasTailCall();
-    return DAG.getNode(CallOpc, dl, MVT::Other, Ops);
+    SDValue Ret = DAG.getNode(CallOpc, dl, MVT::Other, Ops);
+    DAG.addNoMergeSiteInfo(Ret.getNode(), CFlags.NoMerge);
+    return Ret;
   }
 
   std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
@@ -5590,15 +5730,52 @@ SDValue PPCTargetLowering::FinishCall(
                                  ? NumBytes
                                  : 0;
 
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
-                             DAG.getIntPtrConstant(BytesCalleePops, dl, true),
-                             Glue, dl);
+  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, BytesCalleePops, Glue, dl);
   Glue = Chain.getValue(1);
 
   return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,
                          DAG, InVals);
 }
 
+bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {
+  CallingConv::ID CalleeCC = CB->getCallingConv();
+  const Function *CallerFunc = CB->getCaller();
+  CallingConv::ID CallerCC = CallerFunc->getCallingConv();
+  const Function *CalleeFunc = CB->getCalledFunction();
+  if (!CalleeFunc)
+    return false;
+  const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(CalleeFunc);
+
+  SmallVector<ISD::OutputArg, 2> Outs;
+  SmallVector<ISD::InputArg, 2> Ins;
+
+  GetReturnInfo(CalleeCC, CalleeFunc->getReturnType(),
+                CalleeFunc->getAttributes(), Outs, *this,
+                CalleeFunc->getParent()->getDataLayout());
+
+  return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,
+                          CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,
+                          false /*isCalleeExternalSymbol*/);
+}
+
+bool PPCTargetLowering::isEligibleForTCO(
+    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
+    CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
+    bool isCalleeExternalSymbol) const {
+  if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
+    return false;
+
+  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+    return IsEligibleForTailCallOptimization_64SVR4(
+        CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,
+        isCalleeExternalSymbol);
+  else
+    return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,
+                                             isVarArg, Ins);
+}
+
 SDValue
 PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
@@ -5616,14 +5793,15 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   const CallBase *CB = CLI.CB;
 
   if (isTailCall) {
-    if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
-      isTailCall = false;
-    else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
-      isTailCall = IsEligibleForTailCallOptimization_64SVR4(
-          Callee, CallConv, CB, isVarArg, Outs, Ins, DAG);
-    else
-      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
-                                                     Ins, DAG);
+    MachineFunction &MF = DAG.getMachineFunction();
+    CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
+    auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
+    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
+    bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Callee);
+
+    isTailCall =
+        isEligibleForTCO(GV, CallConv, CallerCC, CB, isVarArg, Outs, Ins,
+                         &(MF.getFunction()), IsCalleeExternalSymbol);
     if (isTailCall) {
       ++NumTailCalls;
       if (!getTargetMachine().Options.GuaranteedTailCallOpt)
@@ -5743,7 +5921,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
       if (Result) {
 #ifndef NDEBUG
         errs() << "Call operand #" << i << " has unhandled type "
-               << EVT(ArgVT).getEVTString() << "\n";
+               << ArgVT << "\n";
 #endif
         llvm_unreachable(nullptr);
       }
@@ -5759,14 +5937,14 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
   CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
 
   // Reserve stack space for the allocations in CCInfo.
-  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
+  CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);
 
   CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
 
   // Size of the linkage area, parameter list area and the part of the local
   // space variable where copies of aggregates which are passed by value are
   // stored.
-  unsigned NumBytes = CCByValInfo.getNextStackOffset();
+  unsigned NumBytes = CCByValInfo.getStackSize();
 
   // Calculate by how many bytes the stack has to be adjusted in case of tail
   // call optimization.
@@ -5886,30 +6064,30 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
-  SDValue InFlag;
+  SDValue InGlue;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
+                             RegsToPass[i].second, InGlue);
+    InGlue = Chain.getValue(1);
   }
 
   // Set CR bit 6 to true if this is a vararg call with floating args passed in
   // registers.
   if (IsVarArg) {
     SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    SDValue Ops[] = { Chain, InFlag };
+    SDValue Ops[] = { Chain, InGlue };
 
-    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
-                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
+    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl,
+                        VTs, ArrayRef(Ops, InGlue.getNode() ? 2 : 1));
 
-    InFlag = Chain.getValue(1);
+    InGlue = Chain.getValue(1);
   }
 
   if (IsTailCall)
-    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+    PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                     TailCallArguments);
 
-  return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+  return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
                     Callee, SPDiff, NumBytes, Ins, InVals, CB);
 }
 
@@ -5979,9 +6157,9 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
 
-  const unsigned NumGPRs = array_lengthof(GPR);
+  const unsigned NumGPRs = std::size(GPR);
   const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
-  const unsigned NumVRs = array_lengthof(VR);
+  const unsigned NumVRs = std::size(VR);
 
   // On ELFv2, we can avoid allocating the parameter area if all the arguments
   // can be passed to the callee in registers.
@@ -6257,8 +6435,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
         SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
         SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
         if (GPR_idx != NumGPRs) {
-          SDValue Load =
-              DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo());
+          unsigned LoadSizeInBits = std::min(PtrByteSize, (Size - j)) * 8;
+          EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), LoadSizeInBits);
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, AddArg,
+                                        MachinePointerInfo(), ObjType);
+
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
 
           ArgOffset += PtrByteSize;
@@ -6497,18 +6678,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
-  SDValue InFlag;
+  SDValue InGlue;
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
+                             RegsToPass[i].second, InGlue);
+    InGlue = Chain.getValue(1);
   }
 
   if (CFlags.IsTailCall && !IsSibCall)
-    PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
+    PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
                     TailCallArguments);
 
-  return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+  return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
                     Callee, SPDiff, NumBytes, Ins, InVals, CB);
 }
 
@@ -6584,8 +6765,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
     // but needs a MemLoc for a stack slot for the formal arguments side.
     if (ByValSize == 0) {
       State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
-                                       State.getNextStackOffset(), RegVT,
-                                       LocInfo));
+                                       State.getStackSize(), RegVT, LocInfo));
       return false;
     }
@@ -6612,7 +6792,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
     // i64 arguments should have been split to i32 for PPC32.
     assert(IsPPC64 && "PPC32 should have split i64 values.");
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case MVT::i1:
   case MVT::i32: {
     const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
@@ -6891,8 +7071,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   if (useSoftFloat())
     report_fatal_error("Soft float support is unimplemented on AIX.");
 
-  const PPCSubtarget &Subtarget =
-      static_cast<const PPCSubtarget &>(DAG.getSubtarget());
+  const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
 
   const bool IsPPC64 = Subtarget.isPPC64();
   const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
@@ -7074,7 +7253,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
       // be future work.
       SDValue Store = DAG.getStore(
           CopyFrom.getValue(1), dl, CopyFrom,
-          DAG.getObjectPtrOffset(dl, FIN, TypeSize::Fixed(Offset)),
+          DAG.getObjectPtrOffset(dl, FIN, TypeSize::getFixed(Offset)),
           MachinePointerInfo::getFixedStack(MF, FI, Offset));
 
       MemOps.push_back(Store);
@@ -7129,8 +7308,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
   // On AIX a minimum of 8 words is saved to the parameter save area.
   const unsigned MinParameterSaveArea = 8 * PtrByteSize;
   // Area that is at least reserved in the caller of this function.
-  unsigned CallerReservedArea =
-      std::max(CCInfo.getNextStackOffset(), LinkageSize + MinParameterSaveArea);
+  unsigned CallerReservedArea = std::max<unsigned>(
+      CCInfo.getStackSize(), LinkageSize + MinParameterSaveArea);
 
   // Set the size that is at least reserved in caller of this function. Tail
   // call optimized function's reserved stack space needs to be aligned so
@@ -7142,7 +7321,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
 
   if (isVarArg) {
     FuncInfo->setVarArgsFrameIndex(
-        MFI.CreateFixedObject(PtrByteSize, CCInfo.getNextStackOffset(), true));
+        MFI.CreateFixedObject(PtrByteSize, CCInfo.getStackSize(), true));
     SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
 
     static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
                                        PPC::R7, PPC::R8, PPC::R9, PPC::R10};
 
     static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
                                        PPC::X7, PPC::X8, PPC::X9, PPC::X10};
 
-    const unsigned NumGPArgRegs = array_lengthof(IsPPC64 ? GPR_64 : GPR_32);
GPR_64 : GPR_32); // The fixed integer arguments of a variadic function are stored to the // VarArgsFrameIndex on the stack so that they may be loaded by // dereferencing the result of va_next. for (unsigned GPRIndex = - (CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize; + (CCInfo.getStackSize() - LinkageSize) / PtrByteSize; GPRIndex < NumGPArgRegs; ++GPRIndex) { const Register VReg = @@ -7197,8 +7376,7 @@ SDValue PPCTargetLowering::LowerCall_AIX( if (CFlags.IsPatchPoint) report_fatal_error("This call type is unimplemented on AIX."); - const PPCSubtarget& Subtarget = - static_cast<const PPCSubtarget&>(DAG.getSubtarget()); + const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>(); MachineFunction &MF = DAG.getMachineFunction(); SmallVector<CCValAssign, 16> ArgLocs; @@ -7223,8 +7401,8 @@ SDValue PPCTargetLowering::LowerCall_AIX( // conservatively assume that it is needed. As such, make sure we have at // least enough stack space for the caller to store the 8 GPRs. const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize; - const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize, - CCInfo.getNextStackOffset()); + const unsigned NumBytes = std::max<unsigned>( + LinkageSize + MinParameterSaveAreaSize, CCInfo.getStackSize()); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass. @@ -7255,12 +7433,12 @@ SDValue PPCTargetLowering::LowerCall_AIX( } auto GetLoad = [&](EVT VT, unsigned LoadOffset) { - return DAG.getExtLoad( - ISD::ZEXTLOAD, dl, PtrVT, Chain, - (LoadOffset != 0) - ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset)) - : Arg, - MachinePointerInfo(), VT); + return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, + (LoadOffset != 0) + ? DAG.getObjectPtrOffset( + dl, Arg, TypeSize::getFixed(LoadOffset)) + : Arg, + MachinePointerInfo(), VT); }; unsigned LoadOffset = 0; @@ -7290,11 +7468,11 @@ SDValue PPCTargetLowering::LowerCall_AIX( // Only memcpy the bytes that don't pass in register. MemcpyFlags.setByValSize(ByValSize - LoadOffset); Chain = CallSeqStart = createMemcpyOutsideCallSeq( - (LoadOffset != 0) - ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset)) - : Arg, - DAG.getObjectPtrOffset(dl, StackPtr, - TypeSize::Fixed(ByValVA.getLocMemOffset())), + (LoadOffset != 0) ? DAG.getObjectPtrOffset( + dl, Arg, TypeSize::getFixed(LoadOffset)) + : Arg, + DAG.getObjectPtrOffset( + dl, StackPtr, TypeSize::getFixed(ByValVA.getLocMemOffset())), CallSeqStart, MemcpyFlags, DAG, dl); continue; } @@ -7309,7 +7487,7 @@ SDValue PPCTargetLowering::LowerCall_AIX( "Unexpected register residue for by-value argument."); SDValue ResidueVal; for (unsigned Bytes = 0; Bytes != ResidueBytes;) { - const unsigned N = PowerOf2Floor(ResidueBytes - Bytes); + const unsigned N = llvm::bit_floor(ResidueBytes - Bytes); const MVT VT = N == 1 ? MVT::i8 : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64)); @@ -7490,14 +7668,14 @@ SDValue PPCTargetLowering::LowerCall_AIX( // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
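The residue loop above peels the leftover by-value bytes into power-of-two loads; llvm::bit_floor is the same operation as C++20's std::bit_floor. A minimal standalone sketch of the decomposition, assuming a 7-byte residue:

```cpp
#include <bit>
#include <cstdio>

int main() {
  // A 7-byte register residue decomposes into 4-, 2- and 1-byte pieces:
  // each step peels off the largest power-of-two chunk that still fits.
  unsigned ResidueBytes = 7;
  for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
    const unsigned N = std::bit_floor(ResidueBytes - Bytes); // 4, then 2, then 1
    std::printf("load %u byte(s) at offset %u\n", N, Bytes);
    Bytes += N;
  }
  return 0;
}
```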
- SDValue InFlag; + SDValue InGlue; for (auto Reg : RegsToPass) { - Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InFlag); - InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InGlue); + InGlue = Chain.getValue(1); } const int SPDiff = 0; - return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart, + return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins, InVals, CB); } @@ -7528,7 +7706,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, ? RetCC_PPC_Cold : RetCC_PPC); - SDValue Flag; + SDValue Glue; SmallVector<SDValue, 4> RetOps(1, Chain); // Copy the result values into the output registers. @@ -7557,26 +7735,26 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl)); - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Glue); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg, DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl)); - Flag = Chain.getValue(1); + Glue = Chain.getValue(1); VA = RVLocs[++i]; // skip ahead to next loc - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Flag); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Glue); } else - Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue); + Glue = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } RetOps[0] = Chain; // Update chain. - // Add the flag if we have it. - if (Flag.getNode()) - RetOps.push_back(Flag); + // Add the glue if we have it. + if (Glue.getNode()) + RetOps.push_back(Glue); - return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); + return DAG.getNode(PPCISD::RET_GLUE, dl, MVT::Other, RetOps); } SDValue @@ -7801,15 +7979,15 @@ SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op, EVT EltVT = TrgVT.getVectorElementType(); if (!isOperationCustom(Op.getOpcode(), TrgVT) || TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) || - !isPowerOf2_32(EltVT.getSizeInBits())) + !llvm::has_single_bit<uint32_t>(EltVT.getSizeInBits())) return SDValue(); SDValue N1 = Op.getOperand(0); EVT SrcVT = N1.getValueType(); unsigned SrcSize = SrcVT.getSizeInBits(); - if (SrcSize > 256 || - !isPowerOf2_32(SrcVT.getVectorNumElements()) || - !isPowerOf2_32(SrcVT.getVectorElementType().getSizeInBits())) + if (SrcSize > 256 || !isPowerOf2_32(SrcVT.getVectorNumElements()) || + !llvm::has_single_bit<uint32_t>( + SrcVT.getVectorElementType().getSizeInBits())) return SDValue(); if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2) return SDValue(); @@ -7882,7 +8060,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDNodeFlags Flags = Op.getNode()->getFlags(); - // We have xsmaxcdp/xsmincdp which are OK to emit even in the + // We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the // presence of infinities. 
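The select_cc shape matched below is the one whose true/false values are the compare operands themselves, which the "type-C" scalar min/max instructions are defined to implement directly (they follow the C conditional expression, which is why infinities are fine but the NaN/no-infs guards further down still apply). A plain C++ stand-in for the two recognized shapes, illustrative only:

```cpp
#include <cstdio>

// select (a > b), a, b  ->  xsmaxc[dq]p
double selectCC_GT(double a, double b) { return a > b ? a : b; }
// select (a < b), a, b  ->  xsminc[dq]p
double selectCC_LT(double a, double b) { return a < b ? a : b; }

int main() {
  std::printf("%f %f\n", selectCC_GT(1.0, 2.0), selectCC_LT(1.0, 2.0));
  return 0;
}
```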
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) { switch (CC) { @@ -7890,10 +8068,10 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { break; case ISD::SETOGT: case ISD::SETGT: - return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS); + return DAG.getNode(PPCISD::XSMAXC, dl, Op.getValueType(), LHS, RHS); case ISD::SETOLT: case ISD::SETLT: - return DAG.getNode(PPCISD::XSMINCDP, dl, Op.getValueType(), LHS, RHS); + return DAG.getNode(PPCISD::XSMINC, dl, Op.getValueType(), LHS, RHS); } } @@ -7902,7 +8080,8 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // For more information, see section F.3 of the 2.06 ISA specification. // With ISA 3.0 if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) || - (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs())) + (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()) || + ResVT == MVT::f128) return Op; // If the RHS of the comparison is a 0.0, we don't need to do the @@ -7913,7 +8092,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); - LLVM_FALLTHROUGH; + [[fallthrough]]; case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); @@ -7925,7 +8104,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt - LLVM_FALLTHROUGH; + [[fallthrough]]; case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -7934,7 +8113,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt - LLVM_FALLTHROUGH; + [[fallthrough]]; case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -7948,7 +8127,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); - LLVM_FALLTHROUGH; + [[fallthrough]]; case ISD::SETEQ: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -8023,7 +8202,11 @@ static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG, // For strict nodes, source is the second operand. SDValue Src = Op.getOperand(IsStrict ? 1 : 0); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - assert(Src.getValueType().isFloatingPoint()); + MVT DestTy = Op.getSimpleValueType(); + assert(Src.getValueType().isFloatingPoint() && + (DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 || + DestTy == MVT::i64) && + "Invalid FP_TO_INT types"); if (Src.getValueType() == MVT::f32) { if (IsStrict) { Src = @@ -8033,9 +8216,10 @@ static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG, } else Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); } - SDValue Conv; + if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector()) + DestTy = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; unsigned Opc = ISD::DELETED_NODE; - switch (Op.getSimpleValueType().SimpleTy) { + switch (DestTy.SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); case MVT::i32: Opc = IsSigned ? 
PPCISD::FCTIWZ @@ -8046,12 +8230,14 @@ static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG, "i64 FP_TO_UINT is supported only with FPCVT"); Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ; } + EVT ConvTy = Src.getValueType() == MVT::f128 ? MVT::f128 : MVT::f64; + SDValue Conv; if (IsStrict) { Opc = getPPCStrictOpcode(Opc); - Conv = DAG.getNode(Opc, dl, DAG.getVTList(MVT::f64, MVT::Other), - {Chain, Src}, Flags); + Conv = DAG.getNode(Opc, dl, DAG.getVTList(ConvTy, MVT::Other), {Chain, Src}, + Flags); } else { - Conv = DAG.getNode(Opc, dl, MVT::f64, Src); + Conv = DAG.getNode(Opc, dl, ConvTy, Src); } return Conv; } @@ -8138,10 +8324,8 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept()); if (IsSigned) { - SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src, - DAG.getIntPtrConstant(0, dl)); - SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Src, - DAG.getIntPtrConstant(1, dl)); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::f64, MVT::f64); // Add the two halves of the long double in round-to-zero mode, and use // a smaller FP_TO_SINT. @@ -8303,7 +8487,7 @@ void PPCTargetLowering::spliceIntoChain(SDValue ResChain, /// prefer float load to int load plus direct move /// when there is no integer use of int load bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const { - SDNode *Origin = Op.getOperand(0).getNode(); + SDNode *Origin = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0).getNode(); if (Origin->getOpcode() != ISD::LOAD) return true; @@ -8629,7 +8813,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags); else FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, - DAG.getIntPtrConstant(0, dl)); + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); } return FP; } @@ -8710,13 +8894,13 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags); else FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, - DAG.getIntPtrConstant(0, dl)); + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); } return FP; } -SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, - SelectionDAG &DAG) const { +SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op, + SelectionDAG &DAG) const { SDLoc dl(Op); /* The rounding mode is in bits 30:31 of FPSR, and has the following @@ -8726,7 +8910,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 10 Round to +inf 11 Round to -inf - FLT_ROUNDS, on the other hand, expects the following: + GET_ROUNDING, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest @@ -9040,7 +9224,7 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) { const SDValue *InputLoad = &Op; - if (InputLoad->getOpcode() == ISD::BITCAST) + while (InputLoad->getOpcode() == ISD::BITCAST) InputLoad = &InputLoad->getOperand(0); if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR || InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) { @@ -9224,7 +9408,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Exclude somes case where LD_SPLAT is worse than scalar_to_vector: // Below cases should also happen for "lfiwzx/lfiwax + LE target + index // 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index - // 15", but funciton IsValidSplatLoad() now will only return true when + // 15", but function 
IsValidSplatLoad() now will only return true when // the data at index 0 is not nullptr. So we will not get into trouble for // these cases. // @@ -9364,7 +9548,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16 }; - for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) { + for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) { // Indirect through the SplatCsts array so that we favor 'vsplti -1' for // cases which are ambiguous (e.g. formation of 0x8000_0000). 'vsplti -1' int i = SplatCsts[idx]; @@ -9741,8 +9925,11 @@ SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN, // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx. if (RHS->getOpcode() != ISD::BUILD_VECTOR) { std::swap(LHS, RHS); - VecShuffle = DAG.getCommutedVectorShuffle(*SVN); - ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask(); + VecShuffle = peekThroughBitcasts(DAG.getCommutedVectorShuffle(*SVN)); + ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(VecShuffle); + if (!CommutedSV) + return SDValue(); + ShuffleMask = CommutedSV->getMask(); } // Ensure that the RHS is a vector of constants. @@ -9804,7 +9991,7 @@ SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { SDValue N1 = peekThroughBitcasts(Op.getOperand(1)); unsigned SHLAmt = N1.getConstantOperandVal(0); if (SHLAmt % 8 == 0) { - SmallVector<int, 16> Mask(16, 0); + std::array<int, 16> Mask; std::iota(Mask.begin(), Mask.end(), 0); std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end()); if (SDValue Shuffle = @@ -9906,6 +10093,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return LdSplt; } } + + // All v2i64 and v2f64 shuffles are legal + if (VT == MVT::v2i64 || VT == MVT::v2f64) + return Op; + if (Subtarget.hasP9Vector() && PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap, isLittleEndian)) { @@ -10051,98 +10243,197 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // perfect shuffle table to emit an optimal matching sequence. ArrayRef<int> PermMask = SVOp->getMask(); - unsigned PFIndexes[4]; - bool isFourElementShuffle = true; - for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number - unsigned EltNo = 8; // Start out undef. - for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. - if (PermMask[i*4+j] < 0) - continue; // Undef, ignore it. - - unsigned ByteSource = PermMask[i*4+j]; - if ((ByteSource & 3) != j) { - isFourElementShuffle = false; - break; - } + if (!DisablePerfectShuffle && !isLittleEndian) { + unsigned PFIndexes[4]; + bool isFourElementShuffle = true; + for (unsigned i = 0; i != 4 && isFourElementShuffle; + ++i) { // Element number + unsigned EltNo = 8; // Start out undef. + for (unsigned j = 0; j != 4; ++j) { // Intra-element byte. + if (PermMask[i * 4 + j] < 0) + continue; // Undef, ignore it. 
+ + unsigned ByteSource = PermMask[i * 4 + j]; + if ((ByteSource & 3) != j) { + isFourElementShuffle = false; + break; + } - if (EltNo == 8) { - EltNo = ByteSource/4; - } else if (EltNo != ByteSource/4) { - isFourElementShuffle = false; - break; + if (EltNo == 8) { + EltNo = ByteSource / 4; + } else if (EltNo != ByteSource / 4) { + isFourElementShuffle = false; + break; + } } + PFIndexes[i] = EltNo; + } + + // If this shuffle can be expressed as a shuffle of 4-byte elements, use the + // perfect shuffle vector to determine if it is cost effective to do this as + // discrete instructions, or whether we should use a vperm. + // For now, we skip this for little endian until such time as we have a + // little-endian perfect shuffle table. + if (isFourElementShuffle) { + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + // Determining when to avoid vperm is tricky. Many things affect the cost + // of vperm, particularly how many times the perm mask needs to be + // computed. For example, if the perm mask can be hoisted out of a loop or + // is already used (perhaps because there are multiple permutes with the + // same shuffle mask?) the vperm has a cost of 1. OTOH, hoisting the + // permute mask out of the loop requires an extra register. + // + // As a compromise, we only emit discrete instructions if the shuffle can + // be generated in 3 or fewer operations. When we have loop information + // available, if this block is within a loop, we should avoid using vperm + // for 3-operation perms and use a constant pool load instead. + if (Cost < 3) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } - PFIndexes[i] = EltNo; - } - - // If this shuffle can be expressed as a shuffle of 4-byte elements, use the - // perfect shuffle vector to determine if it is cost effective to do this as - // discrete instructions, or whether we should use a vperm. - // For now, we skip this for little endian until such time as we have a - // little-endian perfect shuffle table. - if (isFourElementShuffle && !isLittleEndian) { - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = - PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; - - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - // Determining when to avoid vperm is tricky. Many things affect the cost - // of vperm, particularly how many times the perm mask needs to be computed. - // For example, if the perm mask can be hoisted out of a loop or is already - // used (perhaps because there are multiple permutes with the same shuffle - // mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of - // the loop requires an extra register. - // - // As a compromise, we only emit discrete instructions if the shuffle can be - // generated in 3 or fewer operations. When we have loop information - // available, if this block is within a loop, we should avoid using vperm - // for 3-operation perms and use a constant pool load instead. - if (Cost < 3) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant // vector that will get spilled to the constant pool. 
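For reference, the perfect-shuffle indexing computed above treats each of the four 32-bit result elements as a base-9 digit (source element 0-7, or 8 for undef), and the selected table entry carries the expansion cost in its top two bits. A standalone sketch with made-up values (PerfectShuffleTable itself is generated elsewhere):

```cpp
#include <cstdio>

int main() {
  unsigned PFIndexes[4] = {4, 1, 6, 3}; // example element sources, 8 = undef
  // Base-9 index into the 9^4-entry perfect shuffle table.
  unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                          PFIndexes[2] * 9 + PFIndexes[3];
  unsigned PFEntry = 0x80000000u; // stand-in table entry
  unsigned Cost = PFEntry >> 30;  // cost lives in bits 31:30
  std::printf("table index %u, cost %u\n", PFTableIndex, Cost);
  // Only shuffles with Cost < 3 are expanded as discrete instructions.
  return 0;
}
```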
if (V2.isUndef()) V2 = V1; + return LowerVPERM(Op, DAG, PermMask, VT, V1, V2); +} + +SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG, + ArrayRef<int> PermMask, EVT VT, + SDValue V1, SDValue V2) const { + unsigned Opcode = PPCISD::VPERM; + EVT ValType = V1.getValueType(); + SDLoc dl(Op); + bool NeedSwap = false; + bool isLittleEndian = Subtarget.isLittleEndian(); + bool isPPC64 = Subtarget.isPPC64(); + + if (Subtarget.hasVSX() && Subtarget.hasP9Vector() && + (V1->hasOneUse() || V2->hasOneUse())) { + LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using " + "XXPERM instead\n"); + Opcode = PPCISD::XXPERM; + + // The second input to XXPERM is also an output so if the second input has + // multiple uses then copying is necessary, as a result we want the + // single-use operand to be used as the second input to prevent copying. + if ((!isLittleEndian && !V2->hasOneUse() && V1->hasOneUse()) || + (isLittleEndian && !V1->hasOneUse() && V2->hasOneUse())) { + std::swap(V1, V2); + NeedSwap = !NeedSwap; + } + } + // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except // that it is in input element units, not in bytes. Convert now. // For little endian, the order of the input vectors is reversed, and // the permutation mask is complemented with respect to 31. This is - // necessary to produce proper semantics with the big-endian-biased vperm + // necessary to produce proper semantics with the big-endian-based vperm // instruction. EVT EltVT = V1.getValueType().getVectorElementType(); - unsigned BytesPerElement = EltVT.getSizeInBits()/8; + unsigned BytesPerElement = EltVT.getSizeInBits() / 8; + + bool V1HasXXSWAPD = V1->getOperand(0)->getOpcode() == PPCISD::XXSWAPD; + bool V2HasXXSWAPD = V2->getOperand(0)->getOpcode() == PPCISD::XXSWAPD; + + /* + Vectors will be appended like so: [ V1 | v2 ] + XXSWAPD on V1: + [ A | B | C | D ] -> [ C | D | A | B ] + 0-3 4-7 8-11 12-15 0-3 4-7 8-11 12-15 + i.e. index of A, B += 8, and index of C, D -= 8. + XXSWAPD on V2: + [ E | F | G | H ] -> [ G | H | E | F ] + 16-19 20-23 24-27 28-31 16-19 20-23 24-27 28-31 + i.e. index of E, F += 8, index of G, H -= 8 + Swap V1 and V2: + [ V1 | V2 ] -> [ V2 | V1 ] + 0-15 16-31 0-15 16-31 + i.e. index of V1 += 16, index of V2 -= 16 + */ SmallVector<SDValue, 16> ResultMask; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { unsigned SrcElt = PermMask[i] < 0 ? 
0 : PermMask[i]; + if (V1HasXXSWAPD) { + if (SrcElt < 8) + SrcElt += 8; + else if (SrcElt < 16) + SrcElt -= 8; + } + if (V2HasXXSWAPD) { + if (SrcElt > 23) + SrcElt -= 8; + else if (SrcElt > 15) + SrcElt += 8; + } + if (NeedSwap) { + if (SrcElt < 16) + SrcElt += 16; + else + SrcElt -= 16; + } for (unsigned j = 0; j != BytesPerElement; ++j) if (isLittleEndian) - ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement + j), - dl, MVT::i32)); + ResultMask.push_back( + DAG.getConstant(31 - (SrcElt * BytesPerElement + j), dl, MVT::i32)); else - ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement + j, dl, - MVT::i32)); + ResultMask.push_back( + DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32)); + } + + if (V1HasXXSWAPD) { + dl = SDLoc(V1->getOperand(0)); + V1 = V1->getOperand(0)->getOperand(1); + } + if (V2HasXXSWAPD) { + dl = SDLoc(V2->getOperand(0)); + V2 = V2->getOperand(0)->getOperand(1); + } + + if (isPPC64 && (V1HasXXSWAPD || V2HasXXSWAPD)) { + if (ValType != MVT::v2f64) + V1 = DAG.getBitcast(MVT::v2f64, V1); + if (V2.getValueType() != MVT::v2f64) + V2 = DAG.getBitcast(MVT::v2f64, V2); } ShufflesHandledWithVPERM++; SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask); - LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n"); - LLVM_DEBUG(SVOp->dump()); - LLVM_DEBUG(dbgs() << "With the following permute control vector:\n"); - LLVM_DEBUG(VPermMask.dump()); + LLVM_DEBUG({ + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + if (Opcode == PPCISD::XXPERM) { + dbgs() << "Emitting a XXPERM for the following shuffle:\n"; + } else { + dbgs() << "Emitting a VPERM for the following shuffle:\n"; + } + SVOp->dump(); + dbgs() << "With the following permute control vector:\n"; + VPermMask.dump(); + }); + + if (Opcode == PPCISD::XXPERM) + VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask); + // Only need to place items backwards in LE, + // the mask was properly calculated. if (isLittleEndian) - return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), - V2, V1, VPermMask); - else - return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), - V1, V2, VPermMask); + std::swap(V1, V2); + + SDValue VPERMNode = + DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask); + + VPERMNode = DAG.getBitcast(ValType, VPERMNode); + return VPERMNode; } /// getVectorCompareInfo - Given an intrinsic, return false if it is not a @@ -10150,8 +10441,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, /// information about the intrinsic. static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, bool &isDot, const PPCSubtarget &Subtarget) { - unsigned IntrinsicID = - cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Intrin.getConstantOperandVal(0); CompareOpc = -1; isDot = false; switch (IntrinsicID) { @@ -10436,8 +10726,7 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, /// lower, do it, otherwise return null. 
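The permute-control construction above can be checked in isolation: vperm selects bytes in big-endian numbering, so little-endian targets swap the two inputs and complement every byte index against 31 to get the same observable shuffle. A standalone sketch (the interleave mask is just an example):

```cpp
#include <cstdio>

// Build a 16-byte vperm control vector from a vector_shuffle mask whose
// entries index elements of the concatenation [V1|V2].
void buildVPermMask(const int *PermMask, unsigned NumElts,
                    unsigned BytesPerElt, bool IsLittleEndian,
                    unsigned char *Out) {
  for (unsigned i = 0, k = 0; i != NumElts; ++i) {
    unsigned SrcElt = PermMask[i] < 0 ? 0 : (unsigned)PermMask[i];
    for (unsigned j = 0; j != BytesPerElt; ++j, ++k)
      Out[k] = IsLittleEndian ? 31 - (SrcElt * BytesPerElt + j)
                              : (unsigned char)(SrcElt * BytesPerElt + j);
  }
}

int main() {
  int Mask[4] = {0, 4, 1, 5}; // interleave low halves of two v4i32 inputs
  unsigned char BE[16], LE[16];
  buildVPermMask(Mask, 4, 4, false, BE);
  buildVPermMask(Mask, 4, 4, true, LE);
  for (int k = 0; k < 16; ++k)
    std::printf("%2u/%2u ", BE[k], LE[k]); // BE index / LE complement
  std::printf("\n");
  return 0;
}
```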
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { - unsigned IntrinsicID = - cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned IntrinsicID = Op.getConstantOperandVal(0); SDLoc dl(Op); @@ -10448,7 +10737,46 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getRegister(PPC::X13, MVT::i64); return DAG.getRegister(PPC::R2, MVT::i32); - case Intrinsic::ppc_mma_disassemble_acc: + case Intrinsic::ppc_mma_disassemble_acc: { + if (Subtarget.isISAFuture()) { + EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; + SDValue WideVec = SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl, + ArrayRef(ReturnTypes, 2), + Op.getOperand(1)), + 0); + SmallVector<SDValue, 4> RetOps; + SDValue Value = SDValue(WideVec.getNode(), 0); + SDValue Value2 = SDValue(WideVec.getNode(), 1); + + SDValue Extract; + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value2 : Value, + DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value2 : Value, + DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value : Value2, + DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + Extract = DAG.getNode( + PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Subtarget.isLittleEndian() ? Value : Value2, + DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1, + dl, getPointerTy(DAG.getDataLayout()))); + RetOps.push_back(Extract); + return DAG.getMergeValues(RetOps, dl); + } + [[fallthrough]]; + } case Intrinsic::ppc_vsx_disassemble_pair: { int NumVecs = 2; SDValue WideVec = Op.getOperand(1); @@ -10468,6 +10796,20 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMergeValues(RetOps, dl); } + case Intrinsic::ppc_mma_xxmfacc: + case Intrinsic::ppc_mma_xxmtacc: { + // Allow pre-isa-future subtargets to lower as normal. + if (!Subtarget.isISAFuture()) + return SDValue(); + // The intrinsics for xxmtacc and xxmfacc take one argument of + // type v512i1, for future cpu the corresponding wacc instruction + // dmxx[inst|extf]dmr512 is always generated for type v512i1, negating + // the need to produce the xxm[t|f]acc. + SDValue WideVec = Op.getOperand(1); + DAG.ReplaceAllUsesWith(Op, WideVec); + return SDValue(); + } + case Intrinsic::ppc_unpack_longdouble: { auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) && @@ -10506,11 +10848,11 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getTargetConstant(Pred, dl, MVT::i32)}), 0); } - case Intrinsic::ppc_test_data_class_d: - case Intrinsic::ppc_test_data_class_f: { - unsigned CmprOpc = PPC::XSTSTDCDP; - if (IntrinsicID == Intrinsic::ppc_test_data_class_f) - CmprOpc = PPC::XSTSTDCSP; + case Intrinsic::ppc_test_data_class: { + EVT OpVT = Op.getOperand(1).getValueType(); + unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP + : (OpVT == MVT::f64 ? 
PPC::XSTSTDCDP + : PPC::XSTSTDCSP); return SDValue( DAG.getMachineNode( PPC::SELECT_CC_I4, dl, MVT::i32, @@ -10521,6 +10863,16 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}), 0); } + case Intrinsic::ppc_fnmsub: { + EVT VT = Op.getOperand(1).getValueType(); + if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128)) + return DAG.getNode( + ISD::FNEG, dl, VT, + DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2), + DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3)))); + return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } case Intrinsic::ppc_convert_f128_to_ppcf128: case Intrinsic::ppc_convert_ppcf128_to_f128: { RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128 @@ -10532,6 +10884,31 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, dl, SDValue()); return Result.first; } + case Intrinsic::ppc_maxfe: + case Intrinsic::ppc_maxfl: + case Intrinsic::ppc_maxfs: + case Intrinsic::ppc_minfe: + case Intrinsic::ppc_minfl: + case Intrinsic::ppc_minfs: { + EVT VT = Op.getValueType(); + assert( + all_of(Op->ops().drop_front(4), + [VT](const SDUse &Use) { return Use.getValueType() == VT; }) && + "ppc_[max|min]f[e|l|s] must have uniform type arguments"); + (void)VT; + ISD::CondCode CC = ISD::SETGT; + if (IntrinsicID == Intrinsic::ppc_minfe || + IntrinsicID == Intrinsic::ppc_minfl || + IntrinsicID == Intrinsic::ppc_minfs) + CC = ISD::SETLT; + unsigned I = Op.getNumOperands() - 2, Cnt = I; + SDValue Res = Op.getOperand(I); + for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) { + Res = + DAG.getSelectCC(dl, Res, Op.getOperand(I), Res, Op.getOperand(I), CC); + } + return Res; + } } // If this is a lowered altivec predicate compare, CompareOpc is set to the @@ -10567,7 +10944,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Unpack the result based on how the target uses it. unsigned BitNo; // Bit # of CR6. bool InvertBit; // Invert result? - switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { + switch (Op.getConstantOperandVal(1)) { default: // Can't happen, don't crash on invalid number though. case 0: // Return the value of the EQ bit of CR6. BitNo = 0; InvertBit = false; @@ -10603,10 +10980,9 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, // the beginning of the argument list. int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1; SDLoc DL(Op); - switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) { + switch (Op.getConstantOperandVal(ArgStart)) { case Intrinsic::ppc_cfence: { assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); - assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); SDValue Val = Op.getOperand(ArgStart + 1); EVT Ty = Val.getValueType(); if (Ty == MVT::i128) { @@ -10614,9 +10990,11 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, // ordering? Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val); } + unsigned Opcode = Subtarget.isPPC64() ? PPC::CFENCE8 : PPC::CFENCE; + EVT FTy = Subtarget.isPPC64() ? 
MVT::i64 : MVT::i32; return SDValue( - DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, - DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Val), + DAG.getMachineNode(Opcode, DL, MVT::Other, + DAG.getNode(ISD::ANY_EXTEND, DL, FTy, Val), Op.getOperand(0)), 0); } @@ -10718,14 +11096,14 @@ SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op, SmallVector<SDValue, 4> Ops{ N->getOperand(0), DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)}; - SDValue Val = N->getOperand(2); + SDValue Val = N->getOperand(1); SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val); SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val, DAG.getConstant(64, dl, MVT::i32)); ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi); Ops.push_back(ValLo); Ops.push_back(ValHi); - Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(2)); return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT, N->getMemOperand()); } @@ -10734,6 +11112,153 @@ SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op, } } +static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl, + SelectionDAG &DAG, + const PPCSubtarget &Subtarget) { + assert(Mask <= fcAllFlags && "Invalid fp_class flags!"); + + enum DataClassMask { + DC_NAN = 1 << 6, + DC_NEG_INF = 1 << 4, + DC_POS_INF = 1 << 5, + DC_NEG_ZERO = 1 << 2, + DC_POS_ZERO = 1 << 3, + DC_NEG_SUBNORM = 1, + DC_POS_SUBNORM = 1 << 1, + }; + + EVT VT = Op.getValueType(); + + unsigned TestOp = VT == MVT::f128 ? PPC::XSTSTDCQP + : VT == MVT::f64 ? PPC::XSTSTDCDP + : PPC::XSTSTDCSP; + + if (Mask == fcAllFlags) + return DAG.getBoolConstant(true, Dl, MVT::i1, VT); + if (Mask == 0) + return DAG.getBoolConstant(false, Dl, MVT::i1, VT); + + // When it's cheaper or necessary to test reverse flags. + if ((Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan) { + SDValue Rev = getDataClassTest(Op, ~Mask, Dl, DAG, Subtarget); + return DAG.getNOT(Dl, Rev, MVT::i1); + } + + // Power doesn't support testing whether a value is 'normal'. Test the rest + // first, and test if it's 'not not-normal' with expected sign. + if (Mask & fcNormal) { + SDValue Rev(DAG.getMachineNode( + TestOp, Dl, MVT::i32, + DAG.getTargetConstant(DC_NAN | DC_NEG_INF | DC_POS_INF | + DC_NEG_ZERO | DC_POS_ZERO | + DC_NEG_SUBNORM | DC_POS_SUBNORM, + Dl, MVT::i32), + Op), + 0); + // Sign are stored in CR bit 0, result are in CR bit 2. + SDValue Sign( + DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, Rev, + DAG.getTargetConstant(PPC::sub_lt, Dl, MVT::i32)), + 0); + SDValue Normal(DAG.getNOT( + Dl, + SDValue(DAG.getMachineNode( + TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, Rev, + DAG.getTargetConstant(PPC::sub_eq, Dl, MVT::i32)), + 0), + MVT::i1)); + if (Mask & fcPosNormal) + Sign = DAG.getNOT(Dl, Sign, MVT::i1); + SDValue Result = DAG.getNode(ISD::AND, Dl, MVT::i1, Sign, Normal); + if (Mask == fcPosNormal || Mask == fcNegNormal) + return Result; + + return DAG.getNode( + ISD::OR, Dl, MVT::i1, + getDataClassTest(Op, Mask & ~fcNormal, Dl, DAG, Subtarget), Result); + } + + // The instruction doesn't differentiate between signaling or quiet NaN. Test + // the rest first, and test if it 'is NaN and is signaling/quiet'. + if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) { + bool IsQuiet = Mask & fcQNan; + SDValue NanCheck = getDataClassTest(Op, fcNan, Dl, DAG, Subtarget); + + // Quietness is determined by the first bit in fraction field. 
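The quiet-bit masks used in the surrounding lowering fall straight out of the IEEE-754 layouts: the most significant fraction bit sits at bit 19 of a double's high word (0x80000) and at bit 22 of a float (0x400000). A standalone check using C++20 std::bit_cast and hand-built bit patterns:

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

bool isQuietNaN64(double D) {
  uint32_t HighWord = (uint32_t)(std::bit_cast<uint64_t>(D) >> 32);
  // Exponent all ones plus the quiet bit set implies a quiet NaN.
  return (HighWord & 0x7FF00000) == 0x7FF00000 && (HighWord & 0x80000);
}

bool isQuietNaN32(float F) {
  uint32_t Bits = std::bit_cast<uint32_t>(F);
  return (Bits & 0x7F800000) == 0x7F800000 && (Bits & 0x400000);
}

int main() {
  double QNaN = std::bit_cast<double>(0x7FF8000000000000ULL);
  double SNaN = std::bit_cast<double>(0x7FF0000000000001ULL);
  std::printf("%d %d %d\n", isQuietNaN64(QNaN), isQuietNaN64(SNaN),
              isQuietNaN32(std::bit_cast<float>(0x7FC00000u))); // 1 0 1
  return 0;
}
```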
+ uint64_t QuietMask = 0; + SDValue HighWord; + if (VT == MVT::f128) { + HighWord = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op), + DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 3 : 0, Dl)); + QuietMask = 0x8000; + } else if (VT == MVT::f64) { + if (Subtarget.isPPC64()) { + HighWord = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, + DAG.getBitcast(MVT::i64, Op), + DAG.getConstant(1, Dl, MVT::i32)); + } else { + SDValue Vec = DAG.getBitcast( + MVT::v4i32, DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v2f64, Op)); + HighWord = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, Vec, + DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 1 : 0, Dl)); + } + QuietMask = 0x80000; + } else if (VT == MVT::f32) { + HighWord = DAG.getBitcast(MVT::i32, Op); + QuietMask = 0x400000; + } + SDValue NanRes = DAG.getSetCC( + Dl, MVT::i1, + DAG.getNode(ISD::AND, Dl, MVT::i32, HighWord, + DAG.getConstant(QuietMask, Dl, MVT::i32)), + DAG.getConstant(0, Dl, MVT::i32), IsQuiet ? ISD::SETNE : ISD::SETEQ); + NanRes = DAG.getNode(ISD::AND, Dl, MVT::i1, NanCheck, NanRes); + if (Mask == fcQNan || Mask == fcSNan) + return NanRes; + + return DAG.getNode(ISD::OR, Dl, MVT::i1, + getDataClassTest(Op, Mask & ~fcNan, Dl, DAG, Subtarget), + NanRes); + } + + unsigned NativeMask = 0; + if ((Mask & fcNan) == fcNan) + NativeMask |= DC_NAN; + if (Mask & fcNegInf) + NativeMask |= DC_NEG_INF; + if (Mask & fcPosInf) + NativeMask |= DC_POS_INF; + if (Mask & fcNegZero) + NativeMask |= DC_NEG_ZERO; + if (Mask & fcPosZero) + NativeMask |= DC_POS_ZERO; + if (Mask & fcNegSubnormal) + NativeMask |= DC_NEG_SUBNORM; + if (Mask & fcPosSubnormal) + NativeMask |= DC_POS_SUBNORM; + return SDValue( + DAG.getMachineNode( + TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, + SDValue(DAG.getMachineNode( + TestOp, Dl, MVT::i32, + DAG.getTargetConstant(NativeMask, Dl, MVT::i32), Op), + 0), + DAG.getTargetConstant(PPC::sub_eq, Dl, MVT::i32)), + 0); +} + +SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasP9Vector() && "Test data class requires Power9"); + SDValue LHS = Op.getOperand(0); + const auto *RHS = cast<ConstantSDNode>(Op.getOperand(1)); + SDLoc Dl(Op); + FPClassTest Category = static_cast<FPClassTest>(RHS->getZExtValue()); + return getDataClassTest(LHS, Category, Dl, DAG, Subtarget); +} + SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -10867,6 +11392,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue StoreChain = SN->getChain(); SDValue BasePtr = SN->getBasePtr(); SDValue Value = SN->getValue(); + SDValue Value2 = SN->getValue(); EVT StoreVT = Value.getValueType(); if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1) @@ -10883,13 +11409,29 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SmallVector<SDValue, 4> Stores; unsigned NumVecs = 2; if (StoreVT == MVT::v512i1) { - Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); + if (Subtarget.isISAFuture()) { + EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1}; + MachineSDNode *ExtNode = DAG.getMachineNode( + PPC::DMXXEXTFDMR512, dl, ArrayRef(ReturnTypes, 2), Op.getOperand(1)); + + Value = SDValue(ExtNode, 0); + Value2 = SDValue(ExtNode, 1); + } else + Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value); NumVecs = 4; } for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { unsigned VecNum = Subtarget.isLittleEndian() ? 
NumVecs - 1 - Idx : Idx; - SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, - DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + SDValue Elt; + if (Subtarget.isISAFuture()) { + VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2); + Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, + Idx > 1 ? Value2 : Value, + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + } else + Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); + SDValue Store = DAG.getStore(StoreChain, dl, Elt, BasePtr, SN->getPointerInfo().getWithOffset(Idx * 16), @@ -11003,7 +11545,7 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { return SDValue(); // Custom lower is only done for high or low doubleword. - int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + int Idx = Op0.getConstantOperandVal(1); if (Idx % 2 != 0) return SDValue(); @@ -11058,6 +11600,12 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::FPOW: return lowerPow(Op, DAG); + case ISD::FSIN: return lowerSin(Op, DAG); + case ISD::FCOS: return lowerCos(Op, DAG); + case ISD::FLOG: return lowerLog(Op, DAG); + case ISD::FLOG10: return lowerLog10(Op, DAG); + case ISD::FEXP: return lowerExp(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -11098,7 +11646,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG); - case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); // Lower 64-bit shifts. 
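The v512i1 store loop above writes the accumulator as four 16-byte pieces; big-endian stores register piece 0 first while little-endian stores in reverse piece order, and on the "future" subtarget the index is additionally taken modulo 2 within each 256-bit half. A standalone sketch of the base ordering (offsets and piece numbers only):

```cpp
#include <cstdio>

int main() {
  const unsigned NumVecs = 4; // a 512-bit accumulator as four 16-byte pieces
  for (int LE = 0; LE <= 1; ++LE) {
    std::printf("%s:", LE ? "LE" : "BE");
    for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
      // Which register piece lands at memory offset Idx * 16.
      unsigned VecNum = LE ? NumVecs - 1 - Idx : Idx;
      std::printf(" mem+%u <- piece %u", Idx * 16, VecNum);
    }
    std::printf("\n");
  }
  return 0;
}
```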
case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG); @@ -11138,6 +11686,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_LOAD_STORE(Op, DAG); + case ISD::IS_FPCLASS: + return LowerIS_FPCLASS(Op, DAG); } } @@ -11164,8 +11714,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::INTRINSIC_W_CHAIN: { - if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != - Intrinsic::loop_decrement) + if (N->getConstantOperandVal(1) != Intrinsic::loop_decrement) break; assert(N->getValueType(0) == MVT::i1 && @@ -11181,11 +11730,14 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, break; } case ISD::INTRINSIC_WO_CHAIN: { - switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) { + switch (N->getConstantOperandVal(0)) { case Intrinsic::ppc_pack_longdouble: Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128, N->getOperand(2), N->getOperand(1))); break; + case Intrinsic::ppc_maxfe: + case Intrinsic::ppc_minfe: + case Intrinsic::ppc_fnmsub: case Intrinsic::ppc_convert_f128_to_ppcf128: Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)); break; @@ -11272,7 +11824,7 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. - if (isa<LoadInst>(Inst) && Subtarget.isPPC64()) + if (isa<LoadInst>(Inst)) return Builder.CreateCall( Intrinsic::getDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), @@ -11360,7 +11912,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, // For max/min... // loopMBB: // l[wd]arx dest, ptr - // cmpl?[wd] incr, dest + // cmpl?[wd] dest, incr // bgt exitMBB // loop2MBB: // st[wd]cx. dest, ptr @@ -11373,19 +11925,20 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB, if (BinOpcode) BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest); if (CmpOpcode) { + Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); // Signed comparisons of byte or halfword values must be sign-extended. if (CmpOpcode == PPC::CMPW && AtomicSize < 4) { Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); BuildMI(BB, dl, TII->get(AtomicSize == 1 ? 
PPC::EXTSB : PPC::EXTSH), ExtReg).addReg(dest); - BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) - .addReg(incr).addReg(ExtReg); + BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(ExtReg).addReg(incr); } else - BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) - .addReg(incr).addReg(dest); + BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(dest).addReg(incr); BuildMI(BB, dl, TII->get(PPC::BCC)) - .addImm(CmpPred).addReg(PPC::CR0).addMBB(exitMBB); + .addImm(CmpPred) + .addReg(CrReg) + .addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); BB = loop2MBB; @@ -11408,7 +11961,8 @@ static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) { default: return false; case PPC::COPY: - return TII->isSignExtended(MI); + return TII->isSignExtended(MI.getOperand(1).getReg(), + &MI.getMF()->getRegInfo()); case PPC::LHA: case PPC::LHA8: case PPC::LHAU: @@ -11471,14 +12025,15 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( MachineFunction *F = BB->getParent(); MachineRegisterInfo &RegInfo = F->getRegInfo(); Register incr = MI.getOperand(3).getReg(); - bool IsSignExtended = Register::isVirtualRegister(incr) && - isSignExtended(*RegInfo.getVRegDef(incr), TII); + bool IsSignExtended = + incr.isVirtual() && isSignExtended(*RegInfo.getVRegDef(incr), TII); if (CmpOpcode == PPC::CMPW && !IsSignExtended) { Register ValueReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass); BuildMI(*BB, MI, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueReg) .addReg(MI.getOperand(3).getReg()); MI.getOperand(3).setReg(ValueReg); + incr = ValueReg; } // If we support part-word atomic mnemonics, just use them if (Subtarget.hasPartwordAtomics()) @@ -11617,6 +12172,7 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( // For unsigned comparisons, we can directly compare the shifted values. // For signed comparisons we shift and sign extend. Register SReg = RegInfo.createVirtualRegister(GPRC); + Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); BuildMI(BB, dl, TII->get(PPC::AND), SReg) .addReg(TmpDestReg) .addReg(MaskReg); @@ -11633,12 +12189,10 @@ MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary( ValueReg = ValueSReg; CmpReg = incr; } - BuildMI(BB, dl, TII->get(CmpOpcode), PPC::CR0) - .addReg(CmpReg) - .addReg(ValueReg); + BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(ValueReg).addReg(CmpReg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(CmpPred) - .addReg(PPC::CR0) + .addReg(CrReg) .addMBB(exitMBB); BB->addSuccessor(loop2MBB); BB->addSuccessor(exitMBB); @@ -11916,7 +12470,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, return MBB; } -bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const { +bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const { // If the function specifically requests inline stack probes, emit them. 
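The stack-probe hooks here read function attributes; the probe size (handled just below) defaults to 4096 and is rounded down to the stack alignment with a mask, falling back to one probe per alignment unit when the attribute rounds to zero. A minimal sketch of that arithmetic, assuming a power-of-two alignment:

```cpp
#include <cstdio>

unsigned probeSize(unsigned Requested, unsigned StackAlign) {
  // StackAlign must be a power of two for the mask trick to be valid.
  unsigned StackProbeSize = Requested & ~(StackAlign - 1);
  return StackProbeSize ? StackProbeSize : StackAlign;
}

int main() {
  std::printf("%u %u %u\n",
              probeSize(4096, 16), // 4096: already aligned
              probeSize(1000, 16), // 992: rounded down
              probeSize(8, 16));   // 16: fallback to the alignment
  return 0;
}
```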
if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == @@ -11924,19 +12478,16 @@ bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const { return false; } -unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const { +unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const { const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); unsigned StackAlign = TFI->getStackAlignment(); assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) && "Unexpected stack alignment"); // The default stack probe size is 4096 if the function has no // stack-probe-size attribute. - unsigned StackProbeSize = 4096; const Function &Fn = MF.getFunction(); - if (Fn.hasFnAttribute("stack-probe-size")) - Fn.getFnAttribute("stack-probe-size") - .getValueAsString() - .getAsInteger(0, StackProbeSize); + unsigned StackProbeSize = + Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096); // Round down to the stack alignment. StackProbeSize &= ~(StackAlign - 1); return StackProbeSize ? StackProbeSize : StackAlign; @@ -12345,40 +12896,40 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GE); + BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GE); + BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32) - BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GE); + BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64) - BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GE); + BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LE); + BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LE); + BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32) - BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LE); + BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64) - BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LE); + BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GE); + BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GE); + BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32) - BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GE); + BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64) - BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GE); + BB = EmitAtomicBinary(MI, BB, 8, 0, 
PPC::CMPLD, PPC::PRED_LT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8) - BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LE); + BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16) - BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LE); + BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32) - BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LE); + BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64) - BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LE); + BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GT); else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8) BB = EmitPartwordAtomicBinary(MI, BB, true, 0); @@ -12420,20 +12971,20 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, StoreMnemonic = PPC::STDCX; break; } + MachineRegisterInfo &RegInfo = F->getRegInfo(); Register dest = MI.getOperand(0).getReg(); Register ptrA = MI.getOperand(1).getReg(); Register ptrB = MI.getOperand(2).getReg(); + Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); Register oldval = MI.getOperand(3).getReg(); Register newval = MI.getOperand(4).getReg(); DebugLoc dl = MI.getDebugLoc(); MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); - F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); @@ -12447,25 +12998,23 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // loop1MBB: // l[bhwd]arx dest, ptr // cmp[wd] dest, oldval - // bne- midMBB + // bne- exitBB // loop2MBB: // st[bhwd]cx. newval, ptr // bne- loopMBB // b exitBB - // midMBB: - // st[bhwd]cx. dest, ptr // exitBB: BB = loop1MBB; BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB); - BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0) - .addReg(oldval) - .addReg(dest); + BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), CrReg) + .addReg(dest) + .addReg(oldval); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE) - .addReg(PPC::CR0) - .addMBB(midMBB); + .addReg(CrReg) + .addMBB(exitMBB); BB->addSuccessor(loop2MBB); - BB->addSuccessor(midMBB); + BB->addSuccessor(exitMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(StoreMnemonic)) @@ -12480,13 +13029,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); - BB = midMBB; - BuildMI(BB, dl, TII->get(StoreMnemonic)) - .addReg(dest) - .addReg(ptrA) - .addReg(ptrB); - BB->addSuccessor(exitMBB); - // exitMBB: // ... 
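The compare-and-swap expansion above has the classic LL/SC shape: loop1 reloads and compares (larx + cmp, exiting on mismatch), loop2 attempts the conditional store (stcx.) and retries on reservation loss; the removed midMBB used to issue a dummy conditional store on the failure path, whereas the new expansion branches straight to the exit. A portable sketch of the same control structure using standard atomics (not the PPC expansion itself):

```cpp
#include <atomic>
#include <cstdio>

// Strong compare-and-swap built from a weak CAS loop: retry only on
// spurious (reservation-loss) failure, exit as soon as the compare fails.
int cmpxchg(std::atomic<int> &Mem, int Old, int New) {
  int Dest = Old;
  while (!Mem.compare_exchange_weak(Dest, New)) { // "stcx." failed
    if (Dest != Old)                              // compare failed: bne- exit
      break;
  }
  return Dest; // the value observed by the load
}

int main() {
  std::atomic<int> V{7};
  std::printf("%d -> %d\n", cmpxchg(V, 7, 9), V.load()); // 7 -> 9
  return 0;
}
```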
BB = exitMBB; @@ -12508,11 +13050,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, loop1MBB); F->insert(It, loop2MBB); - F->insert(It, midMBB); F->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); @@ -12540,6 +13080,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Register Ptr1Reg; Register TmpReg = RegInfo.createVirtualRegister(GPRC); Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO; + Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass); // thisMBB: // ... // fallthrough --> loopMBB @@ -12561,15 +13102,13 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // lwarx tmpDest, ptr // and tmp, tmpDest, mask // cmpw tmp, oldval3 - // bne- midMBB + // bne- exitBB // loop2MBB: // andc tmp2, tmpDest, mask // or tmp4, tmp2, newval3 // stwcx. tmp4, ptr // bne- loop1MBB // b exitBB - // midMBB: - // stwcx. tmpDest, ptr // exitBB: // srw dest, tmpDest, shift if (ptrA != ZeroReg) { @@ -12634,15 +13173,15 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BuildMI(BB, dl, TII->get(PPC::AND), TmpReg) .addReg(TmpDestReg) .addReg(MaskReg); - BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0) + BuildMI(BB, dl, TII->get(PPC::CMPW), CrReg) .addReg(TmpReg) .addReg(OldVal3Reg); BuildMI(BB, dl, TII->get(PPC::BCC)) .addImm(PPC::PRED_NE) - .addReg(PPC::CR0) - .addMBB(midMBB); + .addReg(CrReg) + .addMBB(exitMBB); BB->addSuccessor(loop2MBB); - BB->addSuccessor(midMBB); + BB->addSuccessor(exitMBB); BB = loop2MBB; BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg) @@ -12663,13 +13202,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB->addSuccessor(loop1MBB); BB->addSuccessor(exitMBB); - BB = midMBB; - BuildMI(BB, dl, TII->get(PPC::STWCX)) - .addReg(TmpDestReg) - .addReg(ZeroReg) - .addReg(PtrReg); - BB->addSuccessor(exitMBB); - // exitMBB: // ... 
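The part-word variant above performs a byte or halfword operation on its containing aligned word with lwarx/stwcx., so the expansion needs a shift that positions the narrow value within the word and a mask that isolates it; little-endian shifts by the low address bits directly, big-endian by the complemented byte position. A standalone sketch of that setup for a 1-byte atomic (the address is illustrative):

```cpp
#include <cstdio>

int main() {
  const unsigned Addr = 0x1002;         // address of a 1-byte atomic
  const unsigned ByteInWord = Addr & 3; // position within the aligned word
  for (int LE = 0; LE <= 1; ++LE) {
    unsigned ShiftAmt = (LE ? ByteInWord : 3 - ByteInWord) * 8;
    unsigned Mask = 0xFFu << ShiftAmt; // isolates the byte within the word
    std::printf("%s: shift %u, mask 0x%08X\n", LE ? "LE" : "BE", ShiftAmt,
                Mask);
  }
  return 0;
}
```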
BB = exitMBB; @@ -13118,7 +13650,7 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) { EVT VT; - switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: return false; case Intrinsic::ppc_altivec_lvx: case Intrinsic::ppc_altivec_lvxl: @@ -13146,7 +13678,7 @@ static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base, if (N->getOpcode() == ISD::INTRINSIC_VOID) { EVT VT; - switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: return false; case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: @@ -13222,9 +13754,8 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { Visited.clear(); Queue.clear(); - for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), - IE = LoadRoots.end(); I != IE; ++I) { - Queue.push_back(*I); + for (SDNode *I : LoadRoots) { + Queue.push_back(I); while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); @@ -13979,9 +14510,9 @@ combineElementTruncationToVectorTruncation(SDNode *N, if (In.isUndef()) Ops.push_back(DAG.getUNDEF(SrcVT)); else { - SDValue Trunc = DAG.getNode(ISD::FP_ROUND, dl, - MVT::f32, In.getOperand(0), - DAG.getIntPtrConstant(1, dl)); + SDValue Trunc = + DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, In.getOperand(0), + DAG.getIntPtrConstant(1, dl, /*isTarget=*/true)); Ops.push_back(Trunc); } } else @@ -14023,17 +14554,23 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize(); SDValue FirstInput = N->getOperand(0); bool IsRoundOfExtLoad = false; + LoadSDNode *FirstLoad = nullptr; if (FirstInput.getOpcode() == ISD::FP_ROUND && FirstInput.getOperand(0).getOpcode() == ISD::LOAD) { - LoadSDNode *LD = dyn_cast<LoadSDNode>(FirstInput.getOperand(0)); - IsRoundOfExtLoad = LD->getExtensionType() == ISD::EXTLOAD; + FirstLoad = cast<LoadSDNode>(FirstInput.getOperand(0)); + IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD; } // Not a build vector of (possibly fp_rounded) loads. if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) || N->getNumOperands() == 1) return SDValue(); + if (!IsRoundOfExtLoad) + FirstLoad = cast<LoadSDNode>(FirstInput); + + SmallVector<LoadSDNode *, 4> InputLoads; + InputLoads.push_back(FirstLoad); for (int i = 1, e = N->getNumOperands(); i < e; ++i) { // If any inputs are fp_round(extload), they all must be. if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND) @@ -14046,53 +14583,55 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { SDValue PreviousInput = IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1); - LoadSDNode *LD1 = dyn_cast<LoadSDNode>(PreviousInput); - LoadSDNode *LD2 = dyn_cast<LoadSDNode>(NextInput); + LoadSDNode *LD1 = cast<LoadSDNode>(PreviousInput); + LoadSDNode *LD2 = cast<LoadSDNode>(NextInput); // If any inputs are fp_round(extload), they all must be. if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD) return SDValue(); - if (!isConsecutiveLS(LD2, LD1, ElemSize, 1, DAG)) + // We only care about regular loads. The PPC-specific load intrinsics + // will not lead to a merge opportunity. 
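The build-vector combine being reworked here merges element loads into one wide load: consecutive loads become a single load from the first address, reverse-consecutive loads become a single load from the last address followed by an element-reversing shuffle. A standalone sketch of the consecutiveness test on plain offsets (the offsets are illustrative):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Element i of the build_vector loads from base + Offs[i].
  std::vector<unsigned> Offs = {24, 16, 8, 0};
  const unsigned ElemSize = 8;
  bool Fwd = true, Rev = true;
  for (size_t i = 1; i < Offs.size(); ++i) {
    Fwd &= Offs[i] == Offs[i - 1] + ElemSize;
    Rev &= Offs[i] + ElemSize == Offs[i - 1];
  }
  if (Fwd)
    std::printf("one wide load at base+%u\n", Offs.front());
  else if (Rev)
    std::printf("one wide load at base+%u, then reverse shuffle\n",
                Offs.back()); // this case fires for the offsets above
  return 0;
}
```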
+ if (!DAG.areNonVolatileConsecutiveLoads(LD2, LD1, ElemSize, 1)) InputsAreConsecutiveLoads = false; - if (!isConsecutiveLS(LD1, LD2, ElemSize, 1, DAG)) + if (!DAG.areNonVolatileConsecutiveLoads(LD1, LD2, ElemSize, 1)) InputsAreReverseConsecutive = false; // Exit early if the loads are neither consecutive nor reverse consecutive. if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive) return SDValue(); + InputLoads.push_back(LD2); } assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) && "The loads cannot be both consecutive and reverse consecutive."); - SDValue FirstLoadOp = - IsRoundOfExtLoad ? FirstInput.getOperand(0) : FirstInput; - SDValue LastLoadOp = - IsRoundOfExtLoad ? N->getOperand(N->getNumOperands()-1).getOperand(0) : - N->getOperand(N->getNumOperands()-1); - - LoadSDNode *LD1 = dyn_cast<LoadSDNode>(FirstLoadOp); - LoadSDNode *LDL = dyn_cast<LoadSDNode>(LastLoadOp); + SDValue WideLoad; + SDValue ReturnSDVal; if (InputsAreConsecutiveLoads) { - assert(LD1 && "Input needs to be a LoadSDNode."); - return DAG.getLoad(N->getValueType(0), dl, LD1->getChain(), - LD1->getBasePtr(), LD1->getPointerInfo(), - LD1->getAlignment()); - } - if (InputsAreReverseConsecutive) { - assert(LDL && "Input needs to be a LoadSDNode."); - SDValue Load = DAG.getLoad(N->getValueType(0), dl, LDL->getChain(), - LDL->getBasePtr(), LDL->getPointerInfo(), - LDL->getAlignment()); + assert(FirstLoad && "Input needs to be a LoadSDNode."); + WideLoad = DAG.getLoad(N->getValueType(0), dl, FirstLoad->getChain(), + FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), + FirstLoad->getAlign()); + ReturnSDVal = WideLoad; + } else if (InputsAreReverseConsecutive) { + LoadSDNode *LastLoad = InputLoads.back(); + assert(LastLoad && "Input needs to be a LoadSDNode."); + WideLoad = DAG.getLoad(N->getValueType(0), dl, LastLoad->getChain(), + LastLoad->getBasePtr(), LastLoad->getPointerInfo(), + LastLoad->getAlign()); SmallVector<int, 16> Ops; for (int i = N->getNumOperands() - 1; i >= 0; i--) Ops.push_back(i); - return DAG.getVectorShuffle(N->getValueType(0), dl, Load, - DAG.getUNDEF(N->getValueType(0)), Ops); - } - return SDValue(); + ReturnSDVal = DAG.getVectorShuffle(N->getValueType(0), dl, WideLoad, + DAG.getUNDEF(N->getValueType(0)), Ops); + } else + return SDValue(); + + for (auto *LD : InputLoads) + DAG.makeEquivalentMemoryOrdering(LD, WideLoad); + return ReturnSDVal; } // This function adds the required vector_shuffle needed to get @@ -14403,6 +14942,7 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl, DAG.getVTList(MVT::f64, MVT::Other), Ops, MVT::i8, LDN->getMemOperand()); + DAG.makeEquivalentMemoryOrdering(LDN, Ld); // For signed conversion, we need to sign-extend the value in the VSR if (Signed) { @@ -14457,8 +14997,8 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp); if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) { - FP = DAG.getNode(ISD::FP_ROUND, dl, - MVT::f32, FP, DAG.getIntPtrConstant(0, dl)); + FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, + DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); DCI.AddToWorklist(FP.getNode()); } @@ -14472,6 +15012,11 @@ SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, // builtins) into loads with swaps. SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const { + // Delay VSX load for LE combine until after LegalizeOps to prioritize other + // load combines. 
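The early return below implements the delay described in the comment above. For context, a sketch of the kind of access affected (assumptions: ppc64le, VSX, pre-ISA-3.0, compiled with -mvsx):

    #include <altivec.h>

    // On little-endian VSX targets before ISA 3.0, this load is emitted as
    // lxvd2x + xxswapd (a doubleword-permuting load plus an element swap).
    __vector double load_v2f64(const double *p) {
      return vec_xl(0, p);
    }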
+ if (DCI.isBeforeLegalizeOps()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; @@ -14506,13 +15051,6 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, MVT VecTy = N->getValueType(0).getSimpleVT(); - // Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is - // aligned and the type is a vector with elements up to 4 bytes - if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) && - VecTy.getScalarSizeInBits() <= 32) { - return SDValue(); - } - SDValue LoadOps[] = { Chain, Base }; SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl, DAG.getVTList(MVT::v2f64, MVT::Other), @@ -14540,6 +15078,11 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N, // builtins) into stores with swaps. SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const { + // Delay VSX store for LE combine until after LegalizeOps to prioritize other + // store combines. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue Chain; @@ -14577,13 +15120,6 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, SDValue Src = N->getOperand(SrcOpnd); MVT VecTy = Src.getValueType().getSimpleVT(); - // Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is - // aligned and the type is a vector with elements up to 4 bytes - if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) && - VecTy.getScalarSizeInBits() <= 32) { - return SDValue(); - } - // All stores are done as v2f64 and possible bit cast. if (VecTy != MVT::v2f64) { Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src); @@ -14605,60 +15141,49 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N, // Handle DAG combine for STORE (FP_TO_INT F). SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); unsigned Opcode = N->getOperand(1).getOpcode(); + (void)Opcode; + bool Strict = N->getOperand(1)->isStrictFPOpcode(); - assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) + assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT || + Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) && "Not a FP_TO_INT Instruction!"); - SDValue Val = N->getOperand(1).getOperand(0); + SDValue Val = N->getOperand(1).getOperand(Strict ? 1 : 0); EVT Op1VT = N->getOperand(1).getValueType(); EVT ResVT = Val.getValueType(); - if (!isTypeLegal(ResVT)) + if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(ResVT)) return SDValue(); // Only perform combine for conversion to i64/i32 or power9 i16/i8. bool ValidTypeForStoreFltAsInt = - (Op1VT == MVT::i32 || Op1VT == MVT::i64 || + (Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) || (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8))); - if (ResVT == MVT::f128 && !Subtarget.hasP9Vector()) + // TODO: Lower conversion from f128 on all VSX targets + if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector())) return SDValue(); - if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() || + if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) || cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt) return SDValue(); - // Extend f32 values to f64 - if (ResVT.getScalarSizeInBits() == 32) { - Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val); - DCI.AddToWorklist(Val.getNode()); - } - - // Set signed or unsigned conversion opcode. 
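The removed opcode selection below is subsumed by the convertFPToInt helper. A sketch (not from the patch) of the store-of-conversion pattern this combine matches:

    // With VSX and FPCVT the conversion result can stay in a vector-scalar
    // register and be stored directly (ST_VSR_SCAL_INT), instead of being
    // moved to a GPR first.
    void store_trunc(double d, int *p) {
      *p = (int)d;
    }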
- unsigned ConvOpcode = (Opcode == ISD::FP_TO_SINT) ? - PPCISD::FP_TO_SINT_IN_VSR : - PPCISD::FP_TO_UINT_IN_VSR; - - Val = DAG.getNode(ConvOpcode, - dl, ResVT == MVT::f128 ? MVT::f128 : MVT::f64, Val); - DCI.AddToWorklist(Val.getNode()); + Val = convertFPToInt(N->getOperand(1), DAG, Subtarget); // Set number of bytes being converted. unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8; - SDValue Ops[] = { N->getOperand(0), Val, N->getOperand(2), - DAG.getIntPtrConstant(ByteSize, dl, false), - DAG.getValueType(Op1VT) }; + SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2), + DAG.getIntPtrConstant(ByteSize, dl, false), + DAG.getValueType(Op1VT)}; Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl, DAG.getVTList(MVT::Other), Ops, cast<StoreSDNode>(N)->getMemoryVT(), cast<StoreSDNode>(N)->getMemOperand()); - DCI.AddToWorklist(Val.getNode()); return Val; } @@ -14805,10 +15330,21 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, // Adjust the shuffle mask if either input vector comes from a // SCALAR_TO_VECTOR and keep the respective input vector in permuted // form (to prevent the need for a swap). - SmallVector<int, 16> ShuffV(Mask.begin(), Mask.end()); + SmallVector<int, 16> ShuffV(Mask); SDValue SToVLHS = isScalarToVec(LHS); SDValue SToVRHS = isScalarToVec(RHS); if (SToVLHS || SToVRHS) { + // FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the + // same type and have differing element sizes, then do not perform + // the following transformation. The current transformation for + // SCALAR_TO_VECTOR assumes that both input vectors have the same + // element size. This will be updated in the future to account for + // differing sizes of the LHS and RHS. + if (SToVLHS && SToVRHS && + (SToVLHS.getValueType().getScalarSizeInBits() != + SToVRHS.getValueType().getScalarSizeInBits())) + return Res; + int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements() : SToVRHS.getValueType().getVectorNumElements(); int NumEltsOut = ShuffV.size(); @@ -14892,24 +15428,40 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, // Example (even elements from first vector): // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero> if (Mask[0] < NumElts) - for (int i = 1, e = Mask.size(); i < e; i += 2) - ShuffV[i] = (ShuffV[i - 1] + NumElts); + for (int i = 1, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; + // If element from non-splat is undef, pick first element from splat. + ShuffV[i] = (ShuffV[i - 1] >= 0 ? ShuffV[i - 1] : 0) + NumElts; + } // Example (odd elements from first vector): // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero> else - for (int i = 0, e = Mask.size(); i < e; i += 2) - ShuffV[i] = (ShuffV[i + 1] + NumElts); + for (int i = 0, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; + // If element from non-splat is undef, pick first element from splat. + ShuffV[i] = (ShuffV[i + 1] >= 0 ? ShuffV[i + 1] : 0) + NumElts; + } } else { // Example (even elements from first vector): // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1 if (Mask[0] < NumElts) - for (int i = 0, e = Mask.size(); i < e; i += 2) - ShuffV[i] = ShuffV[i + 1] - NumElts; + for (int i = 0, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; + // If element from non-splat is undef, pick first element from splat. + ShuffV[i] = ShuffV[i + 1] >= 0 ? 
ShuffV[i + 1] - NumElts : 0; + } // Example (odd elements from first vector): // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1 else - for (int i = 1, e = Mask.size(); i < e; i += 2) - ShuffV[i] = ShuffV[i - 1] - NumElts; + for (int i = 1, e = Mask.size(); i < e; i += 2) { + if (ShuffV[i] < 0) + continue; + // If element from non-splat is undef, pick first element from splat. + ShuffV[i] = ShuffV[i - 1] >= 0 ? ShuffV[i - 1] - NumElts : 0; + } } // If the RHS has undefs, we need to remove them since we may have created @@ -14994,6 +15546,21 @@ SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN, llvm_unreachable("Expected a load or store node here"); } +static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) { + unsigned IntrinsicID = Intrin.getConstantOperandVal(1); + if (IntrinsicID == Intrinsic::ppc_stdcx) + StoreWidth = 8; + else if (IntrinsicID == Intrinsic::ppc_stwcx) + StoreWidth = 4; + else if (IntrinsicID == Intrinsic::ppc_sthcx) + StoreWidth = 2; + else if (IntrinsicID == Intrinsic::ppc_stbcx) + StoreWidth = 1; + else + return false; + return true; +} + SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -15002,6 +15569,30 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, default: break; case ISD::ADD: return combineADD(N, DCI); + case ISD::AND: { + // We don't want (and (zext (shift...)), C) if C fits in the width of the + // original input as that will prevent us from selecting optimal rotates. + // This only matters if the input to the extend is i32 widened to i64. + SDValue Op1 = N->getOperand(0); + SDValue Op2 = N->getOperand(1); + if ((Op1.getOpcode() != ISD::ZERO_EXTEND && + Op1.getOpcode() != ISD::ANY_EXTEND) || + !isa<ConstantSDNode>(Op2) || N->getValueType(0) != MVT::i64 || + Op1.getOperand(0).getValueType() != MVT::i32) + break; + SDValue NarrowOp = Op1.getOperand(0); + if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL && + NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR) + break; + + uint64_t Imm = Op2->getAsZExtVal(); + // Make sure that the constant is narrow enough to fit in the narrow type. 
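The width check that follows guards exactly this rewrite. At the source level the payoff looks roughly like the sketch below (illustration only): keeping the mask on the i32 value lets the 32-bit rotate-and-mask selector form a single rlwinm.

    #include <cstdint>

    // (and (zext (rotl x, 3)), 0xFF) becomes zext (and (rotl x, 3), 0xFF),
    // which 32-bit rotate-and-mask selection can handle in one instruction.
    uint64_t rot_and_mask(uint32_t x) {
      uint32_t r = (x << 3) | (x >> 29); // rotate left by 3
      return (uint64_t)r & 0xFF;
    }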
+ if (!isUInt<32>(Imm)) + break; + SDValue ConstOp = DAG.getConstant(Imm, dl, MVT::i32); + SDValue NarrowAnd = DAG.getNode(ISD::AND, dl, MVT::i32, NarrowOp, ConstOp); + return DAG.getZExtOrTrunc(NarrowAnd, dl, N->getValueType(0)); + } case ISD::SHL: return combineSHL(N, DCI); case ISD::SRA: @@ -15037,7 +15628,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SETCC: if (SDValue CSCC = combineSetCC(N, DCI)) return CSCC; - LLVM_FALLTHROUGH; + [[fallthrough]]; case ISD::SELECT_CC: return DAGCombineTruncBoolExt(N, DCI); case ISD::SINT_TO_FP: @@ -15054,8 +15645,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, EVT Op1VT = N->getOperand(1).getValueType(); unsigned Opcode = N->getOperand(1).getOpcode(); - if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT) { - SDValue Val= combineStoreFPToInt(N, DCI); + if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT || + Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) { + SDValue Val = combineStoreFPToInt(N, DCI); if (Val) return Val; } @@ -15226,7 +15818,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, auto MMOFlags = LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile; SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr, - LD->getPointerInfo(), LD->getAlignment(), + LD->getPointerInfo(), LD->getAlign(), MMOFlags, LD->getAAInfo()); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), @@ -15234,7 +15826,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDValue FloatLoad2 = DAG.getLoad( MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr, LD->getPointerInfo().getWithOffset(4), - MinAlign(LD->getAlignment(), 4), MMOFlags, LD->getAAInfo()); + commonAlignment(LD->getAlign(), 4), MMOFlags, LD->getAAInfo()); if (LD->isIndexed()) { // Note that DAGCombine should re-form any pre-increment load(s) from @@ -15312,7 +15904,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = MF.getMachineMemOperand(LD->getMemOperand(), - -(long)MemVT.getStoreSize()+1, + -(int64_t)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. @@ -15387,7 +15979,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::INTRINSIC_WO_CHAIN: { bool isLittleEndian = Subtarget.isLittleEndian(); - unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); + unsigned IID = N->getConstantOperandVal(0); Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr : Intrinsic::ppc_altivec_lvsl); if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) { @@ -15400,36 +15992,34 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, .zext(Add.getScalarValueSizeInBits()))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode *U : BasePtr->uses()) { - if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast<ConstantSDNode>(U->getOperand(0))->getZExtValue() == IID) { - // We've found another LVSL/LVSR, and this address is an aligned - // multiple of that one. The results will be the same, so use the - // one we've just found instead. + if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + U->getConstantOperandVal(0) == IID) { + // We've found another LVSL/LVSR, and this address is an aligned + // multiple of that one. The results will be the same, so use the + // one we've just found instead. 
- return SDValue(U, 0); - } + return SDValue(U, 0); + } } } if (isa<ConstantSDNode>(Add->getOperand(1))) { SDNode *BasePtr = Add->getOperand(0).getNode(); for (SDNode *U : BasePtr->uses()) { - if (U->getOpcode() == ISD::ADD && - isa<ConstantSDNode>(U->getOperand(1)) && - (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() - - cast<ConstantSDNode>(U->getOperand(1))->getZExtValue()) % - (1ULL << Bits) == - 0) { - SDNode *OtherAdd = U; - for (SDNode *V : OtherAdd->uses()) { - if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - cast<ConstantSDNode>(V->getOperand(0))->getZExtValue() == - IID) { - return SDValue(V, 0); - } + if (U->getOpcode() == ISD::ADD && + isa<ConstantSDNode>(U->getOperand(1)) && + (Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) % + (1ULL << Bits) == + 0) { + SDNode *OtherAdd = U; + for (SDNode *V : OtherAdd->uses()) { + if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + V->getConstantOperandVal(0) == IID) { + return SDValue(V, 0); } } } + } } } @@ -15469,23 +16059,44 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::INTRINSIC_W_CHAIN: - // For little endian, VSX loads require generating lxvd2x/xxswapd. - // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. - if (Subtarget.needsSwapsForVSXMemOps()) { - switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: break; - case Intrinsic::ppc_vsx_lxvw4x: - case Intrinsic::ppc_vsx_lxvd2x: - return expandVSXLoadForLE(N, DCI); + case Intrinsic::ppc_altivec_vsum4sbs: + case Intrinsic::ppc_altivec_vsum4shs: + case Intrinsic::ppc_altivec_vsum4ubs: { + // These sum-across intrinsics only have a chain due to the side effect + // that they may set the SAT bit. If we know the SAT bit will not be set + // for some inputs, we can replace any uses of their chain with the + // input chain. + if (BuildVectorSDNode *BVN = + dyn_cast<BuildVectorSDNode>(N->getOperand(3))) { + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + bool BVNIsConstantSplat = BVN->isConstantSplat( + APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0, + !Subtarget.isLittleEndian()); + // If the constant splat vector is 0, the SAT bit will not be set. + if (BVNIsConstantSplat && APSplatBits == 0) + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0)); + } + return SDValue(); } + case Intrinsic::ppc_vsx_lxvw4x: + case Intrinsic::ppc_vsx_lxvd2x: + // For little endian, VSX loads require generating lxvd2x/xxswapd. + // Not needed on ISA 3.0 based CPUs since we have a non-permuting load. + if (Subtarget.needsSwapsForVSXMemOps()) + return expandVSXLoadForLE(N, DCI); + break; } break; case ISD::INTRINSIC_VOID: // For little endian, VSX stores require generating xxswapd/stxvd2x. // Not needed on ISA 3.0 based CPUs since we have a non-permuting store. 
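The store-side counterpart of the load case handled earlier; a sketch under the same assumptions (ppc64le, VSX, pre-ISA-3.0):

    #include <altivec.h>

    // On little-endian VSX targets before ISA 3.0 this store is emitted as
    // xxswapd + stxvd2x.
    void store_v2f64(double *p, __vector double v) {
      vec_xst(v, 0, p);
    }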
if (Subtarget.needsSwapsForVSXMemOps()) { - switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { + switch (N->getConstantOperandVal(1)) { default: break; case Intrinsic::ppc_vsx_stxvw4x: @@ -15547,7 +16158,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); SDValue BasePtr = LD->getBasePtr(); SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr, - LD->getPointerInfo(), LD->getAlignment()); + LD->getPointerInfo(), LD->getAlign()); Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo); BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(4, dl)); @@ -15616,75 +16227,33 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(VCMPrecNode, 0); } break; - case ISD::BRCOND: { - SDValue Cond = N->getOperand(1); - SDValue Target = N->getOperand(2); - - if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && - cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == - Intrinsic::loop_decrement) { - - // We now need to make the intrinsic dead (it cannot be instruction - // selected). - DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0)); - assert(Cond.getNode()->hasOneUse() && - "Counter decrement has more than one use"); - - return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other, - N->getOperand(0), Target); - } - } - break; case ISD::BR_CC: { // If this is a branch on an altivec predicate comparison, lower this so // that we don't have to do a MFOCRF: instead, branch directly on CR6. This // lowering is done pre-legalize, because the legalizer lowers the predicate // compare down to code that is difficult to reassemble. + // This code also handles branches that depend on the result of a store + // conditional. ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get(); SDValue LHS = N->getOperand(2), RHS = N->getOperand(3); - // Sometimes the promoted value of the intrinsic is ANDed by some non-zero - // value. If so, pass-through the AND to get to the intrinsic. - if (LHS.getOpcode() == ISD::AND && - LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN && - cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == - Intrinsic::loop_decrement && - isa<ConstantSDNode>(LHS.getOperand(1)) && - !isNullConstant(LHS.getOperand(1))) - LHS = LHS.getOperand(0); - - if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && - cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == - Intrinsic::loop_decrement && - isa<ConstantSDNode>(RHS)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Counter decrement comparison is not EQ or NE"); - - unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); - bool isBDNZ = (CC == ISD::SETEQ && Val) || - (CC == ISD::SETNE && !Val); - - // We now need to make the intrinsic dead (it cannot be instruction - // selected). - DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0)); - assert(LHS.getNode()->hasOneUse() && - "Counter decrement has more than one use"); - - return DAG.getNode(isBDNZ ? 
PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other, - N->getOperand(0), N->getOperand(4)); - } - int CompareOpc; bool isDot; - if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && - isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && - getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { - assert(isDot && "Can't compare against a vector result!"); + if (!isa<ConstantSDNode>(RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE)) + break; + // Since we are doing this pre-legalize, the RHS can be a constant of + // arbitrary bitwidth which may cause issues when trying to get the value + // from the underlying APInt. + auto RHSAPInt = RHS->getAsAPIntVal(); + if (!RHSAPInt.isIntN(64)) + break; + + unsigned Val = RHSAPInt.getZExtValue(); + auto isImpossibleCompare = [&]() { // If this is a comparison against something other than 0/1, then we know // that the condition is never/always true. - unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); if (Val != 0 && Val != 1) { if (CC == ISD::SETEQ) // Cond never true, remove branch. return N->getOperand(0); @@ -15692,9 +16261,59 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getNode(ISD::BR, dl, MVT::Other, N->getOperand(0), N->getOperand(4)); } + return SDValue(); + }; + // Combine branches fed by store conditional instructions (st[bhwd]cx). + unsigned StoreWidth = 0; + if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && + isStoreConditional(LHS, StoreWidth)) { + if (SDValue Impossible = isImpossibleCompare()) + return Impossible; + PPC::Predicate CompOpc; + // eq 0 => ne + // ne 0 => eq + // eq 1 => eq + // ne 1 => ne + if (Val == 0) + CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ; + else + CompOpc = CC == ISD::SETEQ ? PPC::PRED_EQ : PPC::PRED_NE; + + SDValue Ops[] = {LHS.getOperand(0), LHS.getOperand(2), LHS.getOperand(3), + DAG.getConstant(StoreWidth, dl, MVT::i32)}; + auto *MemNode = cast<MemSDNode>(LHS); + SDValue ConstSt = DAG.getMemIntrinsicNode( + PPCISD::STORE_COND, dl, + DAG.getVTList(MVT::i32, MVT::Other, MVT::Glue), Ops, + MemNode->getMemoryVT(), MemNode->getMemOperand()); + + SDValue InChain; + // Unchain the branch from the original store conditional. + if (N->getOperand(0) == LHS.getValue(1)) + InChain = LHS.getOperand(0); + else if (N->getOperand(0).getOpcode() == ISD::TokenFactor) { + SmallVector<SDValue, 4> InChains; + SDValue InTF = N->getOperand(0); + for (int i = 0, e = InTF.getNumOperands(); i < e; i++) + if (InTF.getOperand(i) != LHS.getValue(1)) + InChains.push_back(InTF.getOperand(i)); + InChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, InChains); + } - bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); + return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, InChain, + DAG.getConstant(CompOpc, dl, MVT::i32), + DAG.getRegister(PPC::CR0, MVT::i32), N->getOperand(4), + ConstSt.getValue(2)); + } + + if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && + getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { + assert(isDot && "Can't compare against a vector result!"); + + if (SDValue Impossible = isImpossibleCompare()) + return Impossible; + bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0); // Create the PPCISD altivec 'dot' comparison node. SDValue Ops[] = { LHS.getOperand(2), // LHS of compare @@ -15706,7 +16325,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // Unpack the result based on how the target uses it. 
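A sketch of source the new STORE_COND combine improves. The builtin spelling and signature here are an assumption (Clang's XL-compatibility builtins); the point is that the branch can now consume the CR0 bit set by stwcx. directly instead of first materializing a boolean:

    // Hypothetical use of a store-conditional builtin; a failed reservation
    // makes this return false, and the caller's branch maps onto bne-/beq-
    // on CR0.
    bool try_publish(int *slot, int value) {
      return __builtin_ppc_stwcx(slot, value) != 0;
    }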
PPC::Predicate CompOpc; - switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) { + switch (LHS.getConstantOperandVal(1)) { default: // Can't happen, don't crash on invalid number though. case 0: // Branch on the value of the EQ bit of CR6. CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE; @@ -15731,10 +16350,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); - case ISD::ABS: - return combineABS(N, DCI); - case ISD::VSELECT: - return combineVSelect(N, DCI); } return SDValue(); @@ -15756,7 +16371,7 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SDValue N0 = N->getOperand(0); bool IsNegPow2 = Divisor.isNegatedPowerOf2(); - unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countTrailingZeros(); + unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countr_zero(); SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT); SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt); @@ -15789,7 +16404,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } case ISD::INTRINSIC_WO_CHAIN: { - switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { + switch (Op.getConstantOperandVal(0)) { default: break; case Intrinsic::ppc_altivec_vcmpbfp_p: case Intrinsic::ppc_altivec_vcmpeqfp_p: @@ -15816,7 +16431,7 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } case ISD::INTRINSIC_W_CHAIN: { - switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) { + switch (Op.getConstantOperandVal(1)) { default: break; case Intrinsic::ppc_load2r: @@ -15861,8 +16476,8 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) - for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { - LoopSize += TII->getInstSizeInBytes(*J); + for (const MachineInstr &J : **I) { + LoopSize += TII->getInstSizeInBytes(J); if (LoopSize > 32) break; } @@ -16104,13 +16719,14 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, - std::vector<SDValue>&Ops, + StringRef Constraint, + std::vector<SDValue> &Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints. 
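For grounding, a single-letter constraint this hook validates ('I' is PPC's signed 16-bit immediate class):

    // The 'I' constraint requires a signed 16-bit constant; the hook below
    // folds it into the operand list when it fits.
    int add_const(int x) {
      int r;
      asm("addi %0, %1, %2" : "=r"(r) : "r"(x), "I"(1234));
      return r;
    }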
- if (Constraint.length() > 1) return; + if (Constraint.size() > 1) + return; char Letter = Constraint[0]; switch (Letter) { @@ -16177,6 +16793,24 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } +void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I, + SmallVectorImpl<SDValue> &Ops, + SelectionDAG &DAG) const { + if (I.getNumOperands() <= 1) + return; + if (!isa<ConstantSDNode>(Ops[1].getNode())) + return; + auto IntrinsicID = Ops[1].getNode()->getAsZExtVal(); + if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw && + IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap) + return; + + if (I.hasMetadata("annotation")) { + MDNode *MDN = I.getMetadata("annotation"); + Ops.push_back(DAG.getMDNode(MDN)); + } +} + // isLegalAddressingMode - Return true if the addressing mode represented // by AM is legal for this target, for a load/store of the specified type. bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL, @@ -16232,7 +16866,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, return SDValue(); SDLoc dl(Op); - unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); // Make sure the function does not optimize away the store of the RA to // the stack. @@ -16265,7 +16899,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + unsigned Depth = Op.getConstantOperandVal(0); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -16460,6 +17094,37 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MOStore; return true; } + case Intrinsic::ppc_stdcx: + case Intrinsic::ppc_stwcx: + case Intrinsic::ppc_sthcx: + case Intrinsic::ppc_stbcx: { + EVT VT; + auto Alignment = Align(8); + switch (Intrinsic) { + case Intrinsic::ppc_stdcx: + VT = MVT::i64; + break; + case Intrinsic::ppc_stwcx: + VT = MVT::i32; + Alignment = Align(4); + break; + case Intrinsic::ppc_sthcx: + VT = MVT::i16; + Alignment = Align(2); + break; + case Intrinsic::ppc_stbcx: + VT = MVT::i8; + Alignment = Align(1); + break; + } + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = VT; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = Alignment; + Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; + return true; + } default: break; } @@ -16471,13 +17136,23 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, /// target-independent logic. EVT PPCTargetLowering::getOptimalMemOpType( const MemOp &Op, const AttributeList &FuncAttributes) const { - if (getTargetMachine().getOptLevel() != CodeGenOpt::None) { + if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None) { // We should use Altivec/VSX loads and stores when available. For unaligned // addresses, unaligned VSX loads are only fast starting with the P8. 
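A sketch of what the memop-type choice below affects (illustration; exact instruction counts depend on alignment and subtarget):

    #include <cstring>

    // With Altivec and either 16-byte alignment or P8 unaligned vector
    // support, a 32-byte copy can be done as two v4i32 (16-byte) vector
    // loads/stores.
    void copy32(void *dst, const void *src) {
      std::memcpy(dst, src, 32);
    }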
- if (Subtarget.hasAltivec() && Op.size() >= 16 &&
- (Op.isAligned(Align(16)) ||
- ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
- return MVT::v4i32;
+ if (Subtarget.hasAltivec() && Op.size() >= 16) {
+ if (Op.isMemset() && Subtarget.hasVSX()) {
+ uint64_t TailSize = Op.size() % 16;
+ // For memset lowering, EXTRACT_VECTOR_ELT tries to return a constant
+ // element if the vector element type matches the tail store. For a tail
+ // size of 3 or 4 bytes the tail store is i32, so v4i32 cannot be used
+ // and a type with a legal i32 element is needed.
+ if (TailSize > 2 && TailSize <= 4) {
+ return MVT::v8i16;
+ }
+ return MVT::v4i32;
+ }
+ if (Op.isAligned(Align(16)) || Subtarget.hasP8Vector())
+ return MVT::v4i32;
+ }
}

if (Subtarget.isPPC64()) {
@@ -16552,7 +17227,7 @@ bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
MachineMemOperand::Flags,
- bool *Fast) const {
+ unsigned *Fast) const {
if (DisablePPCUnaligned)
return false;
@@ -16583,7 +17258,7 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
return false;

if (Fast)
- *Fast = true;
+ *Fast = 1;

return true;
}
@@ -16603,7 +17278,7 @@ bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
// 2. If the multiplier after shifting fits in 16 bits, one more shift
// instruction is needed than in case 1, i.e. MULLI and RLDICR
int64_t Imm = ConstNode->getSExtValue();
- unsigned Shift = countTrailingZeros<uint64_t>(Imm);
+ unsigned Shift = llvm::countr_zero<uint64_t>(Imm);
Imm >>= Shift;
if (isInt<16>(Imm))
return false;
@@ -16623,6 +17298,8 @@ bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,

bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
Type *Ty) const {
+ if (Subtarget.hasSPE() || Subtarget.useSoftFloat())
+ return false;
switch (Ty->getScalarType()->getTypeID()) {
case Type::FloatTyID:
case Type::DoubleTyID:
@@ -16825,7 +17502,7 @@ bool PPCTargetLowering::useLoadStackGuardNode() const {
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
if (Subtarget.isAIXABI()) {
M.getOrInsertGlobal(AIXSSPCanaryWordName,
- Type::getInt8PtrTy(M.getContext()));
+ PointerType::getUnqual(M.getContext()));
return;
}
if (!Subtarget.isTargetLinux())
@@ -16849,12 +17526,21 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
// false. Examples: f16, f80.
return false;
case MVT::f32:
- case MVT::f64:
+ case MVT::f64: {
if (Subtarget.hasPrefixInstrs()) {
// We can materialize all immediates via XXSPLTI32DX and XXSPLTIDP.
return true;
}
- LLVM_FALLTHROUGH;
+ bool IsExact;
+ APSInt IntResult(16, false);
+ // The rounding mode doesn't really matter because we only care about
+ // floats that can be converted to integers exactly.
+ Imm.convertToInteger(IntResult, APFloat::rmTowardZero, &IsExact);
+ // For exact values in the range [-16, 15] we can materialize the float.
+ if (IsExact && IntResult <= 15 && IntResult >= -16) + return true; + return Imm.isZero(); + } case MVT::ppcf128: return Imm.isPosZero(); } @@ -17100,24 +17786,6 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N, SDLoc dl(N); SDValue Op0 = N->getOperand(0); - // fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b) - if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) { - EVT VT = N->getValueType(0); - if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) - return SDValue(); - SDValue Sub = Op0.getOperand(0); - if (Sub.getOpcode() == ISD::SUB) { - SDValue SubOp0 = Sub.getOperand(0); - SDValue SubOp1 = Sub.getOperand(1); - if ((SubOp0.getOpcode() == ISD::ZERO_EXTEND) && - (SubOp1.getOpcode() == ISD::ZERO_EXTEND)) { - return DCI.DAG.getNode(PPCISD::VABSD, dl, VT, SubOp0.getOperand(0), - SubOp1.getOperand(0), - DCI.DAG.getTargetConstant(0, dl, MVT::i32)); - } - } - } - // Looking for a truncate of i128 to i64. if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64) return SDValue(); @@ -17301,15 +17969,6 @@ bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee); } -bool PPCTargetLowering::hasBitPreservingFPLogic(EVT VT) const { - if (!Subtarget.hasVSX()) - return false; - if (Subtarget.hasP9Vector() && VT == MVT::f128) - return true; - return VT == MVT::f32 || VT == MVT::f64 || - VT == MVT::v4f32 || VT == MVT::v2f64; -} - bool PPCTargetLowering:: isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { const Value *Mask = AndI.getOperand(1); @@ -17327,112 +17986,6 @@ isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { return true; } -// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0) -// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0) -// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0) -// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0) -// Transform (abs (sub a, b) to (vabsd a b 1)) if a & b of type v4i32 -SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const { - assert((N->getOpcode() == ISD::ABS) && "Need ABS node here"); - assert(Subtarget.hasP9Altivec() && - "Only combine this when P9 altivec supported!"); - EVT VT = N->getValueType(0); - if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - SDLoc dl(N); - if (N->getOperand(0).getOpcode() == ISD::SUB) { - // Even for signed integers, if it's known to be positive (as signed - // integer) due to zero-extended inputs. 
- unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode(); - unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode(); - if ((SubOpcd0 == ISD::ZERO_EXTEND || - SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) && - (SubOpcd1 == ISD::ZERO_EXTEND || - SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) { - return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(), - N->getOperand(0)->getOperand(0), - N->getOperand(0)->getOperand(1), - DAG.getTargetConstant(0, dl, MVT::i32)); - } - - // For type v4i32, it can be optimized with xvnegsp + vabsduw - if (N->getOperand(0).getValueType() == MVT::v4i32 && - N->getOperand(0).hasOneUse()) { - return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(), - N->getOperand(0)->getOperand(0), - N->getOperand(0)->getOperand(1), - DAG.getTargetConstant(1, dl, MVT::i32)); - } - } - - return SDValue(); -} - -// For type v4i32/v8ii16/v16i8, transform -// from (vselect (setcc a, b, setugt), (sub a, b), (sub b, a)) to (vabsd a, b) -// from (vselect (setcc a, b, setuge), (sub a, b), (sub b, a)) to (vabsd a, b) -// from (vselect (setcc a, b, setult), (sub b, a), (sub a, b)) to (vabsd a, b) -// from (vselect (setcc a, b, setule), (sub b, a), (sub a, b)) to (vabsd a, b) -SDValue PPCTargetLowering::combineVSelect(SDNode *N, - DAGCombinerInfo &DCI) const { - assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here"); - assert(Subtarget.hasP9Altivec() && - "Only combine this when P9 altivec supported!"); - - SelectionDAG &DAG = DCI.DAG; - SDLoc dl(N); - SDValue Cond = N->getOperand(0); - SDValue TrueOpnd = N->getOperand(1); - SDValue FalseOpnd = N->getOperand(2); - EVT VT = N->getOperand(1).getValueType(); - - if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB || - FalseOpnd.getOpcode() != ISD::SUB) - return SDValue(); - - // ABSD only available for type v4i32/v8i16/v16i8 - if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8) - return SDValue(); - - // At least to save one more dependent computation - if (!(Cond.hasOneUse() || TrueOpnd.hasOneUse() || FalseOpnd.hasOneUse())) - return SDValue(); - - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - // Can only handle unsigned comparison here - switch (CC) { - default: - return SDValue(); - case ISD::SETUGT: - case ISD::SETUGE: - break; - case ISD::SETULT: - case ISD::SETULE: - std::swap(TrueOpnd, FalseOpnd); - break; - } - - SDValue CmpOpnd1 = Cond.getOperand(0); - SDValue CmpOpnd2 = Cond.getOperand(1); - - // SETCC CmpOpnd1 CmpOpnd2 cond - // TrueOpnd = CmpOpnd1 - CmpOpnd2 - // FalseOpnd = CmpOpnd2 - CmpOpnd1 - if (TrueOpnd.getOperand(0) == CmpOpnd1 && - TrueOpnd.getOperand(1) == CmpOpnd2 && - FalseOpnd.getOperand(0) == CmpOpnd2 && - FalseOpnd.getOperand(1) == CmpOpnd1) { - return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(1).getValueType(), - CmpOpnd1, CmpOpnd2, - DAG.getTargetConstant(0, dl, MVT::i32)); - } - - return SDValue(); -} - /// getAddrModeForFlags - Based on the set of address flags, select the most /// optimal instruction format to match by. PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const { @@ -17531,8 +18084,7 @@ static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet, FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates. else FlagSet |= PPC::MOF_RPlusR; // Register. 
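The flag classification above feeds instruction-format selection; roughly, the two address shapes below end up in D-form and X-form respectively (sketch, not from the patch):

    // reg + signed 16-bit displacement: matches a D-form, e.g. lwz r3, 4(r4).
    int load_dform(const int *p) { return p[1]; }

    // reg + reg: needs an X-form, e.g. lwzx r3, r4, r5.
    int load_xform(const int *p, long i) { return p[i]; }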
- } else if (RHS.getOpcode() == PPCISD::Lo && - !cast<ConstantSDNode>(RHS.getOperand(1))->getZExtValue()) + } else if (RHS.getOpcode() == PPCISD::Lo && !RHS.getConstantOperandVal(1)) FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo. else FlagSet |= PPC::MOF_RPlusR; @@ -17576,7 +18128,7 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N, unsigned ParentOp = Parent->getOpcode(); if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) || (ParentOp == ISD::INTRINSIC_VOID))) { - unsigned ID = cast<ConstantSDNode>(Parent->getOperand(1))->getZExtValue(); + unsigned ID = Parent->getConstantOperandVal(1); if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) { SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) ? Parent->getOperand(2) @@ -17705,7 +18257,7 @@ PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp, bool PPCTargetLowering::splitValueIntoRegisterParts( SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { + unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { EVT ValVT = Val.getValueType(); // If we are splitting a scalar integer into f64 parts (i.e. so they // can be placed into VFRC registers), we need to zero extend and @@ -17721,6 +18273,118 @@ bool PPCTargetLowering::splitValueIntoRegisterParts( return false; } +SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op, + SelectionDAG &DAG) const { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + TargetLowering::CallLoweringInfo CLI(DAG); + EVT RetVT = Op.getValueType(); + Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + SDValue Callee = + DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout())); + bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, false); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + for (const SDValue &N : Op->op_values()) { + EVT ArgVT = N.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + Entry.Node = N; + Entry.Ty = ArgTy; + Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, SignExtend); + Entry.IsZExt = !Entry.IsSExt; + Args.push_back(Entry); + } + + SDValue InChain = DAG.getEntryNode(); + SDValue TCChain = InChain; + const Function &F = DAG.getMachineFunction().getFunction(); + bool isTailCall = + TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && + (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy()); + if (isTailCall) + InChain = TCChain; + CLI.setDebugLoc(SDLoc(Op)) + .setChain(InChain) + .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)) + .setTailCall(isTailCall) + .setSExtResult(SignExtend) + .setZExtResult(!SignExtend) + .setIsPostTypeLegalization(true); + return TLI.LowerCallTo(CLI).first; +} + +SDValue PPCTargetLowering::lowerLibCallBasedOnType( + const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType() == MVT::f32) + return lowerToLibCall(LibCallFloatName, Op, DAG); + + if (Op.getValueType() == MVT::f64) + return lowerToLibCall(LibCallDoubleName, Op, DAG); + + return SDValue(); +} + +bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const { + SDNodeFlags Flags = Op.getNode()->getFlags(); + return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() && + Flags.hasNoNaNs() && Flags.hasNoInfs(); +} + +bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const { + return 
Op.getNode()->getFlags().hasApproximateFuncs(); +} + +bool PPCTargetLowering::isScalarMASSConversionEnabled() const { + return getTargetMachine().Options.PPCGenScalarMASSEntries; +} + +SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName, + const char *LibCallFloatName, + const char *LibCallDoubleNameFinite, + const char *LibCallFloatNameFinite, + SDValue Op, + SelectionDAG &DAG) const { + if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op)) + return SDValue(); + + if (!isLowringToMASSFiniteSafe(Op)) + return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op, + DAG); + + return lowerLibCallBasedOnType(LibCallFloatNameFinite, + LibCallDoubleNameFinite, Op, DAG); +} + +SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_pow", "__xl_powf", "__xl_pow_finite", + "__xl_powf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_sin", "__xl_sinf", "__xl_sin_finite", + "__xl_sinf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_cos", "__xl_cosf", "__xl_cos_finite", + "__xl_cosf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_log", "__xl_logf", "__xl_log_finite", + "__xl_logf_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_log10", "__xl_log10f", "__xl_log10_finite", + "__xl_log10f_finite", Op, DAG); +} + +SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const { + return lowerLibCallBase("__xl_exp", "__xl_expf", "__xl_exp_finite", + "__xl_expf_finite", Op, DAG); +} + // If we happen to match to an aligned D-Form, check if the Frame Index is // adequately aligned. If it is not, reset the mode to match to X-Form. static void setXFormForUnalignedFI(SDValue N, unsigned Flags, @@ -17771,7 +18435,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent, if (Flags & PPC::MOF_RPlusSImm16) { SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); - int16_t Imm = cast<ConstantSDNode>(Op1)->getAPIntValue().getZExtValue(); + int16_t Imm = Op1->getAsZExtVal(); if (!Align || isAligned(*Align, Imm)) { Disp = DAG.getTargetConstant(Imm, DL, N.getValueType()); Base = Op0; @@ -17875,24 +18539,37 @@ CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { case CallingConv::Cold: - return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF_FIS); + return (Return ? 
RetCC_PPC_Cold : CC_PPC64_ELF); default: - return CC_PPC64_ELF_FIS; + return CC_PPC64_ELF; } } +bool PPCTargetLowering::shouldInlineQuadwordAtomics() const { + return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics(); +} + TargetLowering::AtomicExpansionKind PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128) + if (shouldInlineQuadwordAtomics() && Size == 128) return AtomicExpansionKind::MaskedIntrinsic; - return TargetLowering::shouldExpandAtomicRMWInIR(AI); + + switch (AI->getOperation()) { + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: + return AtomicExpansionKind::CmpXChg; + default: + return TargetLowering::shouldExpandAtomicRMWInIR(AI); + } + + llvm_unreachable("unreachable atomicrmw operation"); } TargetLowering::AtomicExpansionKind PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits(); - if (EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && Size == 128) + if (shouldInlineQuadwordAtomics() && Size == 128) return AtomicExpansionKind::MaskedIntrinsic; return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI); } @@ -17922,10 +18599,9 @@ getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) { Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr, Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const { - assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && - "Only support quadword now"); + assert(shouldInlineQuadwordAtomics() && "Only support quadword now"); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = AlignedAddr->getType()->getPointerElementType(); + Type *ValTy = Incr->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *RMW = Intrinsic::getDeclaration( M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); @@ -17933,9 +18609,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo"); Value *IncrHi = Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi"); - Value *Addr = - Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext())); - Value *LoHi = Builder.CreateCall(RMW, {Addr, IncrLo, IncrHi}); + Value *LoHi = Builder.CreateCall(RMW, {AlignedAddr, IncrLo, IncrHi}); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); @@ -17947,10 +18621,9 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr, Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const { - assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() && - "Only support quadword now"); + assert(shouldInlineQuadwordAtomics() && "Only support quadword now"); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = AlignedAddr->getType()->getPointerElementType(); + Type *ValTy = CmpVal->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *IntCmpXchg = Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); @@ -17961,11 +18634,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Value *NewLo = 
Builder.CreateTrunc(NewVal, Int64Ty, "new_lo");
Value *NewHi =
Builder.CreateTrunc(Builder.CreateLShr(NewVal, 64), Int64Ty, "new_hi");
- Value *Addr =
- Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
emitLeadingFence(Builder, CI, Ord);
Value *LoHi =
- Builder.CreateCall(IntCmpXchg, {Addr, CmpLo, CmpHi, NewLo, NewHi});
+ Builder.CreateCall(IntCmpXchg, {AlignedAddr, CmpLo, CmpHi, NewLo, NewHi});
emitTrailingFence(Builder, CI, Ord);
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
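The hunk ends inside emitMaskedAtomicCmpXchgIntrinsic. For context, a sketch (assuming ppc64 with quadword atomics, i.e. POWER8 or later) of source that now takes the inlined lqarx/stqcx. path through ppc_cmpxchg_i128 rather than a library call:

    #include <atomic>

    // The 128-bit value is split into lo/hi i64 halves around the intrinsic,
    // exactly as the CreateTrunc/CreateLShr calls above do.
    bool cas128(std::atomic<__int128> &v, __int128 expected, __int128 desired) {
      return v.compare_exchange_strong(expected, desired);
    }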