Diffstat (limited to 'lib/Target/X86/X86ISelDAGToDAG.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelDAGToDAG.cpp | 975
1 file changed, 645 insertions, 330 deletions
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 660c1eff3c4b..a28d4eac8393 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Config/llvm-config.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" @@ -100,11 +101,11 @@ namespace { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void dump() { + void dump(SelectionDAG *DAG = nullptr) { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; if (Base_Reg.getNode()) - Base_Reg.getNode()->dump(); + Base_Reg.getNode()->dump(DAG); else dbgs() << "nul\n"; if (BaseType == FrameIndexBase) @@ -112,7 +113,7 @@ namespace { dbgs() << " Scale " << Scale << '\n' << "IndexReg "; if (IndexReg.getNode()) - IndexReg.getNode()->dump(); + IndexReg.getNode()->dump(DAG); else dbgs() << "nul\n"; dbgs() << " Disp " << Disp << '\n' @@ -181,6 +182,7 @@ namespace { bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; void PreprocessISelDAG() override; + void PostprocessISelDAG() override; // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" @@ -213,7 +215,7 @@ namespace { bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool selectScalarSSELoad(SDNode *Root, SDValue N, + bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, @@ -225,7 +227,7 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment); - // Convience method where P is also root. + // Convenience method where P is also root. bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -233,6 +235,12 @@ namespace { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } + // Try to fold a vector load. This makes sure the load isn't non-temporal. + bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment); + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -368,6 +376,11 @@ namespace { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } + /// Return a target constant with the specified value, of type i64. + inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { + return CurDAG->getTargetConstant(Imm, DL, MVT::i64); + } + SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); @@ -401,7 +414,7 @@ namespace { return Subtarget->getInstrInfo(); } - /// \brief Address-mode matching performs shift-of-and to and-of-shift + /// Address-mode matching performs shift-of-and to and-of-shift /// reassociation in order to expose more scaled addressing /// opportunities. 
bool ComplexPatternFuncMutatesDAG() const override { @@ -440,10 +453,15 @@ } bool foldLoadStoreIntoMemOperand(SDNode *Node); - bool matchBEXTRFromAnd(SDNode *Node); - + bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; + + MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, + const SDLoc &dl, MVT VT, SDNode *Node); + MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, + const SDLoc &dl, MVT VT, SDNode *Node, + SDValue &InFlag); }; } @@ -452,19 +470,21 @@ // type. static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); - if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM || - Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM || - Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU || - Opcode == X86ISD::CMPM_RND) { + if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC || + Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. EVT OpVT = N->getOperand(0).getValueType(); - if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32) + if (OpVT.is256BitVector() || OpVT.is128BitVector()) return Subtarget->hasVLX(); return true; } + // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. + if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || + Opcode == X86ISD::FSETCCM_RND) + return true; return false; } @@ -518,10 +538,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { // addl 4(%esp), %eax // The former is 2 bytes shorter. In case where the increment is 1, then // the saving can be 4 bytes (by using incl %eax). - if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) + if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) { if (Imm->getAPIntValue().isSignedIntN(8)) return false; + // If this is a 64-bit AND with an immediate that fits in 32 bits, + // prefer using the smaller and over folding the load. This is needed to + // make sure immediates created by shrinkAndImmediate are always folded. + // Ideally we would narrow the load during DAG combine and get the + // best of both worlds. + if (U->getOpcode() == ISD::AND && + Imm->getAPIntValue().getBitWidth() == 64 && + Imm->getAPIntValue().isIntN(32)) + return false; + } + // If the other operand is a TLS address, we should fold it instead. // This produces // movl %gs:0, %eax @@ -537,10 +568,60 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) return false; } + + // Don't fold a load if this matches the BTS/BTR/BTC patterns. 
+ // BTS: (or X, (shl 1, n)) + // BTR: (and X, (rotl -2, n)) + // BTC: (xor X, (shl 1, n)) + if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { + if (U->getOperand(0).getOpcode() == ISD::SHL && + isOneConstant(U->getOperand(0).getOperand(0))) + return false; + + if (U->getOperand(1).getOpcode() == ISD::SHL && + isOneConstant(U->getOperand(1).getOperand(0))) + return false; + } + if (U->getOpcode() == ISD::AND) { + SDValue U0 = U->getOperand(0); + SDValue U1 = U->getOperand(1); + if (U0.getOpcode() == ISD::ROTL) { + auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0)); + if (C && C->getSExtValue() == -2) + return false; + } + + if (U1.getOpcode() == ISD::ROTL) { + auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0)); + if (C && C->getSExtValue() == -2) + return false; + } + } + + break; } + case ISD::SHL: + case ISD::SRA: + case ISD::SRL: + // Don't fold a load into a shift by immediate. The BMI2 instructions + // support folding a load, but not an immediate. The legacy instructions + // support folding an immediate, but can't fold a load. Folding an + // immediate is preferable to folding a load. + if (isa<ConstantSDNode>(U->getOperand(1))) + return false; + + break; } } + // Prevent folding a load if this can be implemented with an insert_subreg or + // a move that implicitly zeroes. + if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && + isNullConstant(Root->getOperand(2)) && + (Root->getOperand(0).isUndef() || + ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) + return false; + return true; } @@ -628,12 +709,24 @@ void X86DAGToDAGISel::PreprocessISelDAG() { E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. + // If this is a target specific AND node with no flag usages, turn it back + // into ISD::AND to enable test instruction matching. + if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { + SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); + --I; + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + ++I; + CurDAG->DeleteNode(N); + continue; + } + if (OptLevel != CodeGenOpt::None && - // Only does this when target favors doesn't favor register indirect - // call. + // Only do this when the target can fold the load into the call or + // jmp. + !Subtarget->useRetpoline() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && - // Only does this if load can be folded into TC_RETURN. (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just @@ -735,6 +828,70 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } +void X86DAGToDAGISel::PostprocessISelDAG() { + // Skip peepholes at -O0. + if (TM.getOptLevel() == CodeGenOpt::None) + return; + + // Attempt to remove vector moves that were inserted to zero upper bits. + + SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode()); + ++Position; + + while (Position != CurDAG->allnodes_begin()) { + SDNode *N = &*--Position; + // Skip dead nodes and any non-machine opcodes. 
+ if (N->use_empty() || !N->isMachineOpcode()) + continue; + + if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG) + continue; + + unsigned SubRegIdx = N->getConstantOperandVal(2); + if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) + continue; + + SDValue Move = N->getOperand(1); + if (!Move.isMachineOpcode()) + continue; + + // Make sure it's one of the move opcodes we recognize. + switch (Move.getMachineOpcode()) { + default: + continue; + case X86::VMOVAPDrr: case X86::VMOVUPDrr: + case X86::VMOVAPSrr: case X86::VMOVUPSrr: + case X86::VMOVDQArr: case X86::VMOVDQUrr: + case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: + case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: + case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: + case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: + case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: + case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: + case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: + case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: + case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: + case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: + case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: + break; + } + + SDValue In = Move.getOperand(0); + if (!In.isMachineOpcode() || + In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) + continue; + + // The producing instruction is another vector instruction. We can drop the + // move. + CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); + + // If the move is now dead, delete it. + if (Move.getNode()->use_empty()) + CurDAG->RemoveDeadNode(Move.getNode()); + } +} + + /// Emit any code that needs to be executed only in the main function. void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { @@ -771,9 +928,14 @@ static bool isDispSafeForFrameIndex(int64_t Val) { bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { + // If there's no offset to fold, we don't need to do any work. + if (Offset == 0) + return false; + // Cannot combine ExternalSymbol displacements with integer offsets. - if (Offset != 0 && (AM.ES || AM.MCSym)) + if (AM.ES || AM.MCSym) return true; + int64_t Val = AM.Disp + Offset; CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit()) { @@ -827,94 +989,60 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { if (AM.hasSymbolicDisplacement()) return true; - SDValue N0 = N.getOperand(0); + bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; + + // We can't use an addressing mode in the 64-bit large code model. In the + // medium code model, we can use such a mode when RIP wrappers are present. + // That signifies access to globals that are known to be "near", such as the + // GOT itself. CodeModel::Model M = TM.getCodeModel(); + if (Subtarget->is64Bit() && + (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel))) + return true; - // Handle X86-64 rip-relative addresses. We check this before checking direct - // folding because RIP is preferable to non-RIP accesses. - if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP && - // Under X86-64 non-small code model, GV (and friends) are 64-bits, so - // they cannot be folded into immediate fields. - // FIXME: This can be improved for kernel and other models? - (M == CodeModel::Small || M == CodeModel::Kernel)) { - // Base and index reg must be 0 in order to use %rip as base. 
- if (AM.hasBaseOrIndexReg()) - return true; - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { - X86ISelAddressMode Backup = AM; - AM.GV = G->getGlobal(); - AM.SymbolFlags = G->getTargetFlags(); - if (foldOffsetIntoAddress(G->getOffset(), AM)) { - AM = Backup; - return true; - } - } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { - X86ISelAddressMode Backup = AM; - AM.CP = CP->getConstVal(); - AM.Align = CP->getAlignment(); - AM.SymbolFlags = CP->getTargetFlags(); - if (foldOffsetIntoAddress(CP->getOffset(), AM)) { - AM = Backup; - return true; - } - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { - AM.ES = S->getSymbol(); - AM.SymbolFlags = S->getTargetFlags(); - } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { - AM.MCSym = S->getMCSymbol(); - } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { - AM.JT = J->getIndex(); - AM.SymbolFlags = J->getTargetFlags(); - } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { - X86ISelAddressMode Backup = AM; - AM.BlockAddr = BA->getBlockAddress(); - AM.SymbolFlags = BA->getTargetFlags(); - if (foldOffsetIntoAddress(BA->getOffset(), AM)) { - AM = Backup; - return true; - } - } else - llvm_unreachable("Unhandled symbol reference node."); + // Base and index reg must be 0 in order to use %rip as base. + if (IsRIPRel && AM.hasBaseOrIndexReg()) + return true; - if (N.getOpcode() == X86ISD::WrapperRIP) - AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); - return false; - } + // Make a local copy in case we can't do this fold. + X86ISelAddressMode Backup = AM; - // Handle the case when globals fit in our immediate field: This is true for - // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit - // mode, this only applies to a non-RIP-relative computation. 
- if (!Subtarget->is64Bit() || - M == CodeModel::Small || M == CodeModel::Kernel) { - assert(N.getOpcode() != X86ISD::WrapperRIP && - "RIP-relative addressing already handled"); - if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { - AM.GV = G->getGlobal(); - AM.Disp += G->getOffset(); - AM.SymbolFlags = G->getTargetFlags(); - } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { - AM.CP = CP->getConstVal(); - AM.Align = CP->getAlignment(); - AM.Disp += CP->getOffset(); - AM.SymbolFlags = CP->getTargetFlags(); - } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { - AM.ES = S->getSymbol(); - AM.SymbolFlags = S->getTargetFlags(); - } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { - AM.MCSym = S->getMCSymbol(); - } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { - AM.JT = J->getIndex(); - AM.SymbolFlags = J->getTargetFlags(); - } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { - AM.BlockAddr = BA->getBlockAddress(); - AM.Disp += BA->getOffset(); - AM.SymbolFlags = BA->getTargetFlags(); - } else - llvm_unreachable("Unhandled symbol reference node."); - return false; + int64_t Offset = 0; + SDValue N0 = N.getOperand(0); + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) { + AM.GV = G->getGlobal(); + AM.SymbolFlags = G->getTargetFlags(); + Offset = G->getOffset(); + } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { + AM.CP = CP->getConstVal(); + AM.Align = CP->getAlignment(); + AM.SymbolFlags = CP->getTargetFlags(); + Offset = CP->getOffset(); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { + AM.ES = S->getSymbol(); + AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); + } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { + AM.JT = J->getIndex(); + AM.SymbolFlags = J->getTargetFlags(); + } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) { + AM.BlockAddr = BA->getBlockAddress(); + AM.SymbolFlags = BA->getTargetFlags(); + Offset = BA->getOffset(); + } else + llvm_unreachable("Unhandled symbol reference node."); + + if (foldOffsetIntoAddress(Offset, AM)) { + AM = Backup; return true; } - return true; + if (IsRIPRel) + AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); + + // Commit the changes now that we know this fold is safe. + return false; } /// Add the specified node to the specified addressing mode, returning true if @@ -988,10 +1116,16 @@ bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { - if (N.getNode()->getNodeId() == -1 || - N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { - DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); - N.getNode()->setNodeId(Pos.getNode()->getNodeId()); + if (N->getNodeId() == -1 || + (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > + SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { + DAG.RepositionNode(Pos->getIterator(), N.getNode()); + // Mark Node as invalid for pruning, since after this it may be a successor + // to a selected node but otherwise be in the same position as Pos. + // Conservatively mark it with the same -abs(Id) to ensure the node id + // invariant is preserved. 
+ N->setNodeId(Pos->getNodeId()); + SelectionDAGISel::InvalidateNodeId(N.getNode()); } } @@ -1196,10 +1330,10 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); - DEBUG({ - dbgs() << "MatchAddress: "; - AM.dump(); - }); + LLVM_DEBUG({ + dbgs() << "MatchAddress: "; + AM.dump(CurDAG); + }); // Limit recursion. if (Depth > 5) return matchAddressBase(N, AM); @@ -1508,6 +1642,12 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { // TODO: Support other operations. switch (N.getOpcode()) { + case ISD::Constant: { + uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); + if (!foldOffsetIntoAddress(Val, AM)) + return false; + break; + } case X86ISD::Wrapper: if (!matchWrapper(N, AM)) return false; @@ -1523,7 +1663,7 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, X86ISelAddressMode AM; auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent); AM.IndexReg = Mgs->getIndex(); - AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8; + AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue(); unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. @@ -1534,14 +1674,8 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, if (AddrSpace == 258) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); - // If Base is 0, the whole address is in index and the Scale is 1 - if (isa<ConstantSDNode>(N)) { - assert(cast<ConstantSDNode>(N)->isNullValue() && - "Unexpected base in gather/scatter"); - AM.Scale = 1; - } - // Otherwise, try to match into the base and displacement fields. - else if (matchVectorAddress(N, AM)) + // Try to match into the base and displacement fields. + if (matchVectorAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); @@ -1604,8 +1738,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, // We can only fold a load if all nodes between it and the root node have a // single use. If there are additional uses, we could end up duplicating the // load. -static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) { - SDNode *User = *N->use_begin(); +static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) { while (User != Root) { if (!User->hasOneUse()) return false; @@ -1622,17 +1755,19 @@ static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) { /// We also return: /// PatternChainNode: this is the matched node that has a chain input and /// output. -bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, +bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &PatternNodeWithChain) { + if (!hasSingleUsesFromRoot(Root, Parent)) + return false; + // We can allow a full vector load here since narrowing a load is ok. 
if (ISD::isNON_EXTLoad(N.getNode())) { PatternNodeWithChain = N; if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) && - hasSingleUsesFromRoot(Root, N.getNode())) { + IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); @@ -1643,8 +1778,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, if (N.getOpcode() == X86ISD::VZEXT_LOAD) { PatternNodeWithChain = N; if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) && - hasSingleUsesFromRoot(Root, N.getNode())) { + IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain); return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, Segment); @@ -1658,8 +1792,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, PatternNodeWithChain = N.getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) && - hasSingleUsesFromRoot(Root, N.getNode())) { + IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); @@ -1675,8 +1808,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, PatternNodeWithChain = N.getOperand(0).getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) && - hasSingleUsesFromRoot(Root, N.getNode())) { + IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { // Okay, this is a zero extending load. Fold it. LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, @@ -1699,10 +1831,10 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { } // In static codegen with small code model, we can get the address of a label - // into a register with 'movl'. TableGen has already made sure we're looking - // at a label of some kind. - assert(N->getOpcode() == X86ISD::Wrapper && - "Unexpected node type for MOV32ri64"); + // into a register with 'movl' + if (N->getOpcode() != X86ISD::Wrapper) + return false; + N = N.getOperand(0); // At least GNU as does not accept 'movl' for TPOFF relocations. @@ -1907,6 +2039,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } +bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + if (!ISD::isNON_EXTLoad(N.getNode()) || + useNonTemporalLoad(cast<LoadSDNode>(N)) || + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) + return false; + + return selectAddr(N.getNode(), + N.getOperand(1), Base, Scale, Index, Disp, Segment); +} + /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. 
@@ -2092,50 +2238,84 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, LoadNode->getOffset() != StoreNode->getOffset()) return false; - // Check if the chain is produced by the load or is a TokenFactor with - // the load output chain as an operand. Return InputChain by reference. + bool FoundLoad = false; + SmallVector<SDValue, 4> ChainOps; + SmallVector<const SDNode *, 4> LoopWorklist; + SmallPtrSet<const SDNode *, 16> Visited; + const unsigned int Max = 1024; + + // Visualization of Load-Op-Store fusion: + // ------------------------- + // Legend: + // *-lines = Chain operand dependencies. + // |-lines = Normal operand dependencies. + // Dependencies flow down and right. n-suffix references multiple nodes. + // + // C Xn C + // * * * + // * * * + // Xn A-LD Yn TF Yn + // * * \ | * | + // * * \ | * | + // * * \ | => A--LD_OP_ST + // * * \| \ + // TF OP \ + // * | \ Zn + // * | \ + // A-ST Zn + // + + // This merge induced dependences from: #1: Xn -> LD, OP, Zn + // #2: Yn -> LD + // #3: ST -> Zn + + // Ensure the transform is safe by checking for the dual + // dependencies to make sure we do not induce a loop. + + // As LD is a predecessor to both OP and ST we can do this by checking: + // a). if LD is a predecessor to a member of Xn or Yn. + // b). if a Zn is a predecessor to ST. + + // However, (b) can only occur through being a chain predecessor to + // ST, which is the same as Zn being a member or predecessor of Xn, + // which is a subset of LD being a predecessor of Xn. So it's + // subsumed by check (a). + SDValue Chain = StoreNode->getChain(); - bool ChainCheck = false; + // Gather X elements in ChainOps. if (Chain == Load.getValue(1)) { - ChainCheck = true; - InputChain = LoadNode->getChain(); + FoundLoad = true; + ChainOps.push_back(Load.getOperand(0)); } else if (Chain.getOpcode() == ISD::TokenFactor) { - SmallVector<SDValue, 4> ChainOps; for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { - ChainCheck = true; + FoundLoad = true; // Drop Load, but keep its chain. No cycle check necessary. ChainOps.push_back(Load.getOperand(0)); continue; } - - // Make sure using Op as part of the chain would not cause a cycle here. - // In theory, we could check whether the chain node is a predecessor of - // the load. But that can be very expensive. Instead visit the uses and - // make sure they all have smaller node id than the load. - int LoadId = LoadNode->getNodeId(); - for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = UI->use_end(); UI != UE; ++UI) { - if (UI.getUse().getResNo() != 0) - continue; - if (UI->getNodeId() > LoadId) - return false; - } - + LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } - - if (ChainCheck) - // Make a new TokenFactor with all the other input chains except - // for the load. - InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), - MVT::Other, ChainOps); } - if (!ChainCheck) + + if (!FoundLoad) return false; + // Worklist is currently Xn. Add Yn to worklist. 
+ for (SDValue Op : StoredVal->ops()) + if (Op.getNode() != LoadNode) + LoopWorklist.push_back(Op.getNode()); + + // Check (a) if Load is a predecessor to Xn + Yn + if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, + true)) + return false; + + InputChain = + CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); return true; } @@ -2177,7 +2357,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { case X86ISD::INC: case X86ISD::DEC: case X86ISD::ADD: + case X86ISD::ADC: case X86ISD::SUB: + case X86ISD::SBB: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: @@ -2225,7 +2407,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { break; } case X86ISD::ADD: + case X86ISD::ADC: case X86ISD::SUB: + case X86ISD::SBB: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: { @@ -2234,9 +2418,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, X86::ADD8mr); + case X86ISD::ADC: + return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, + X86::ADC8mr); case X86ISD::SUB: return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, X86::SUB8mr); + case X86ISD::SBB: + return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, + X86::SBB8mr); case X86ISD::AND: return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, X86::AND8mr); @@ -2253,8 +2443,12 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); + case X86ISD::ADC: + return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); + case X86ISD::SBB: + return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); case X86ISD::AND: return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); case X86ISD::OR: @@ -2270,9 +2464,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, X86::ADD8mi); + case X86ISD::ADC: + return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, + X86::ADC8mi); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, X86::SUB8mi); + case X86ISD::SBB: + return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, + X86::SBB8mi); case X86ISD::AND: return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, X86::AND8mi); @@ -2320,10 +2520,21 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { } } - const SDValue Ops[] = {Base, Scale, Index, Disp, - Segment, Operand, InputChain}; - Result = - CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); + if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { + SDValue CopyTo = + CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, + StoredVal.getOperand(2), SDValue()); + + const SDValue Ops[] = {Base, Scale, Index, Disp, + Segment, Operand, CopyTo, CopyTo.getValue(1)}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, + Ops); + } else { + const SDValue Ops[] = {Base, Scale, Index, Disp, + Segment, Operand, InputChain}; + Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, + Ops); + } break; } default: @@ -2335,6 +2546,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { MemOp[1] = LoadNode->getMemOperand(); Result->setMemRefs(MemOp, MemOp + 2); 
+ // Update Load Chain uses as well. + ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); CurDAG->RemoveDeadNode(Node); @@ -2388,57 +2601,169 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) { if (Shift + MaskSize > NVT.getSizeInBits()) return false; - SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); - unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; - unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + // Create a BEXTR node and run it through selection. + SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT); + SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT, + N0->getOperand(0), C); + ReplaceNode(Node, New.getNode()); + SelectCode(New.getNode()); + return true; +} - // BMI requires the immediate to placed in a register. - if (!Subtarget->hasTBM()) { - ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; - MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; - New = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New), 0); - if (NVT == MVT::i64) { - New = - SDValue(CurDAG->getMachineNode( - TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, - CurDAG->getTargetConstant(0, dl, MVT::i64), New, - CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), - 0); - } +// Emit a PCMPISTR(I/M) instruction. +MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, + bool MayFoldLoad, const SDLoc &dl, + MVT VT, SDNode *Node) { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + SDValue Imm = Node->getOperand(2); + const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); + Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); + + // If there is a load, it will be behind a bitcast. We don't need to check + // alignment on this load. + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() && + tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + SDValue Load = N1.getOperand(0); + SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, + Load.getOperand(0) }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 2)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); + return CNode; } + SDValue Ops[] = { N0, N1, Imm }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); + MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); + return CNode; +} + +// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need +// to emit a second instruction after this one. This is needed since we have two +// copyToReg nodes glued before this and we need to continue that glue through. 
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, + bool MayFoldLoad, const SDLoc &dl, + MVT VT, SDNode *Node, + SDValue &InFlag) { + SDValue N0 = Node->getOperand(0); + SDValue N2 = Node->getOperand(2); + SDValue Imm = Node->getOperand(4); + const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue(); + Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); + + // If there is a load, it will be behind a bitcast. We don't need to check + // alignment on this load. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; - SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); - NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() && + tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4)) { + SDValue Load = N2.getOperand(0); + SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, + Load.getOperand(0), InFlag }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + InFlag = SDValue(CNode, 3); // Update the chain. - ReplaceUses(Input.getValue(1), SDValue(NewNode, 1)); + ReplaceUses(Load.getValue(1), SDValue(CNode, 2)); // Record the mem-refs MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand(); - NewNode->setMemRefs(MemOp, MemOp + 1); - } else { - NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New); + MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); + return CNode; } - ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); - CurDAG->RemoveDeadNode(Node); + SDValue Ops[] = { N0, N2, Imm, InFlag }; + SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); + MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); + InFlag = SDValue(CNode, 2); + return CNode; +} + +/// If the high bits of an 'and' operand are known zero, try setting the +/// high bits of an 'and' constant operand to produce a smaller encoding by +/// creating a small, sign-extended negative immediate rather than a large +/// positive one. This reverses a transform in SimplifyDemandedBits that +/// shrinks mask constants by clearing bits. There is also a possibility that +/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that +/// case, just replace the 'and'. Return 'true' if the node is replaced. +bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { + // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't + // have immediate operands. + MVT VT = And->getSimpleValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1)); + if (!And1C) + return false; + + // Bail out if the mask constant is already negative. It can't shrink more. + // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel + // patterns to use a 32-bit and instead of a 64-bit and by relying on the + // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits + // are negative too. + APInt MaskVal = And1C->getAPIntValue(); + unsigned MaskLZ = MaskVal.countLeadingZeros(); + if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) + return false; + + // Don't extend into the upper 32 bits of a 64 bit mask. 
+ if (VT == MVT::i64 && MaskLZ >= 32) { + MaskLZ -= 32; + MaskVal = MaskVal.trunc(32); + } + + SDValue And0 = And->getOperand(0); + APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); + APInt NegMaskVal = MaskVal | HighZeros; + + // If a negative constant would not allow a smaller encoding, there's no need + // to continue. Only change the constant when we know it's a win. + unsigned MinWidth = NegMaskVal.getMinSignedBits(); + if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) + return false; + + // Extend masks if we truncated above. + if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { + NegMaskVal = NegMaskVal.zext(64); + HighZeros = HighZeros.zext(64); + } + + // The variable operand must be all zeros in the top bits to allow using the + // new, negative constant as the mask. + if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) + return false; + + // Check if the mask is -1. In that case, this is an unnecessary instruction + // that escaped earlier analysis. + if (NegMaskVal.isAllOnesValue()) { + ReplaceNode(And, And0.getNode()); + return true; + } + + // A negative mask allows a smaller encoding. Create a new 'and' node. + SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); + SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); + ReplaceNode(And, NewAnd.getNode()); + SelectCode(NewAnd.getNode()); return true; } void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); - unsigned Opc, MOpc; unsigned Opcode = Node->getOpcode(); SDLoc dl(Node); - DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n'); - if (Node->isMachineOpcode()) { - DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); Node->setNodeId(-1); return; // Already selected. } @@ -2483,9 +2808,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::AND: - // Try to match BEXTR/BEXTRI instruction. if (matchBEXTRFromAnd(Node)) return; + if (shrinkAndImmediate(Node)) + return; LLVM_FALLTHROUGH; case ISD::OR: @@ -2577,7 +2903,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r); + unsigned Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r); SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL, N0, SDValue()).getValue(1); @@ -2594,7 +2920,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - unsigned LoReg; + unsigned LoReg, Opc; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); // MVT::i8 is handled by X86ISD::UMUL8. @@ -2619,13 +2945,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); + unsigned Opc, MOpc; bool isSigned = Opcode == ISD::SMUL_LOHI; bool hasBMI2 = Subtarget->hasBMI2(); if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break; - case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break; case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r; MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break; case MVT::i64: Opc = hasBMI2 ? 
X86::MULX64rr : X86::MUL64r; @@ -2634,8 +2959,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break; - case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break; case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; } @@ -2644,14 +2967,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned SrcReg, LoReg, HiReg; switch (Opc) { default: llvm_unreachable("Unknown MUL opcode!"); - case X86::IMUL8r: - case X86::MUL8r: - SrcReg = LoReg = X86::AL; HiReg = X86::AH; - break; - case X86::IMUL16r: - case X86::MUL16r: - SrcReg = LoReg = X86::AX; HiReg = X86::DX; - break; case X86::IMUL32r: case X86::MUL32r: SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; @@ -2721,27 +3036,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } } - // Prevent use of AH in a REX instruction by referencing AX instead. - if (HiReg == X86::AH && Subtarget->is64Bit() && - !SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::AX, MVT::i16, InFlag); - InFlag = Result.getValue(2); - // Get the low part if needed. Don't use getCopyFromReg for aliasing - // registers. - if (!SDValue(Node, 0).use_empty()) - ReplaceUses(SDValue(Node, 0), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); - - // Shift AX down 8 bits. - Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, - Result, - CurDAG->getTargetConstant(8, dl, MVT::i8)), - 0); - // Then truncate it down to i8. - ReplaceUses(SDValue(Node, 1), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); - } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { if (!ResLo.getNode()) { @@ -2751,7 +3045,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { InFlag = ResLo.getValue(2); } ReplaceUses(SDValue(Node, 0), ResLo); - DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); + dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { @@ -2762,7 +3057,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { InFlag = ResHi.getValue(2); } ReplaceUses(SDValue(Node, 1), ResHi); - DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); + dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); @@ -2776,6 +3072,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); + unsigned Opc, MOpc; bool isSigned = (Opcode == ISD::SDIVREM || Opcode == X86ISD::SDIVREM8_SEXT_HREG); if (!isSigned) { @@ -2909,7 +3206,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); unsigned AHExtOpcode = - isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8; + isSigned ? 
X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, MVT::Glue, AHCopy, InFlag); @@ -2924,7 +3221,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); } ReplaceUses(SDValue(Node, 1), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); + dbgs() << '\n'); } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { @@ -2932,7 +3230,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); + dbgs() << '\n'); } // Copy the remainder (high) result, if it is needed. if (!SDValue(Node, 1).use_empty()) { @@ -2940,18 +3239,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) { HiReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 1), Result); - DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); + dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } - case X86ISD::CMP: - case X86ISD::SUB: { - // Sometimes a SUB is used to perform comparison. - if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0)) - // This node is not a CMP. - break; + case X86ISD::CMP: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); @@ -2962,8 +3257,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. - if ((N0.getOpcode() == ISD::AND || - (N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) && + if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8 && X86::isZeroNode(N1)) { @@ -2971,98 +3265,119 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (!C) break; uint64_t Mask = C->getZExtValue(); - // For example, convert "testl %eax, $8" to "testb %al, $8" + MVT VT; + int SubRegOp; + unsigned Op; + if (isUInt<8>(Mask) && (!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8); - SDValue Reg = N0.getOperand(0); - - // Extract the l-register. - SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, - MVT::i8, Reg); - - // Emit a testb. - SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, - Subreg, Imm); - // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has - // one, do not call ReplaceAllUsesWith. - ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), - SDValue(NewNode, 0)); - CurDAG->RemoveDeadNode(Node); - return; + // For example, convert "testl %eax, $8" to "testb %al, $8" + VT = MVT::i8; + SubRegOp = X86::sub_8bit; + Op = X86::TEST8ri; + } else if (OptForMinSize && isUInt<16>(Mask) && + (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { + // For example, "testl %eax, $32776" to "testw %ax, $32776". + // NOTE: We only want to form TESTW instructions if optimizing for + // min size. Otherwise we only save one byte and possibly get a length + // changing prefix penalty in the decoders. 
+ VT = MVT::i16; + SubRegOp = X86::sub_16bit; + Op = X86::TEST16ri; + } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && + (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) { + // For example, "testq %rax, $268468232" to "testl %eax, $268468232". + // NOTE: We only want to run that transform if N0 is 32 or 64 bits. + // Otherwise, we find ourselves in a position where we have to do + // promotion. If previous passes did not promote the and, we assume + // they had a good reason not to and do not promote here. + VT = MVT::i32; + SubRegOp = X86::sub_32bit; + Op = X86::TEST32ri; + } else { + // No eligible transformation was found. + break; } - // For example, "testl %eax, $2048" to "testb %ah, $8". - if (isShiftedUInt<8, 8>(Mask) && - (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { - // Shift the immediate right by 8 bits. - SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8); - SDValue Reg = N0.getOperand(0); - - // Extract the h-register. - SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, - MVT::i8, Reg); - - // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only - // target GR8_NOREX registers, so make sure the register class is - // forced. - SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, - MVT::i32, Subreg, ShiftedImm); - // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has - // one, do not call ReplaceAllUsesWith. - ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), - SDValue(NewNode, 0)); - CurDAG->RemoveDeadNode(Node); - return; - } + SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); + SDValue Reg = N0.getOperand(0); - // For example, "testl %eax, $32776" to "testw %ax, $32776". - // NOTE: We only want to form TESTW instructions if optimizing for - // min size. Otherwise we only save one byte and possibly get a length - // changing prefix penalty in the decoders. - if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 && - (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16); - SDValue Reg = N0.getOperand(0); - - // Extract the 16-bit subregister. - SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, - MVT::i16, Reg); - - // Emit a testw. - SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, - Subreg, Imm); - // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has - // one, do not call ReplaceAllUsesWith. - ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), - SDValue(NewNode, 0)); - CurDAG->RemoveDeadNode(Node); - return; - } + // Extract the subregister if necessary. + if (N0.getValueType() != VT) + Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); - // For example, "testq %rax, $268468232" to "testl %eax, $268468232". - if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 && - (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32); - SDValue Reg = N0.getOperand(0); - - // Extract the 32-bit subregister. - SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl, - MVT::i32, Reg); - - // Emit a testl. - SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, - Subreg, Imm); - // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has - // one, do not call ReplaceAllUsesWith. - ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 
1 : 0)), - SDValue(NewNode, 0)); - CurDAG->RemoveDeadNode(Node); - return; - } + // Emit a testl or testw. + SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm); + // Replace CMP with TEST. + ReplaceNode(Node, NewNode); + return; } break; } + case X86ISD::PCMPISTR: { + if (!Subtarget->hasSSE42()) + break; + + bool NeedIndex = !SDValue(Node, 0).use_empty(); + bool NeedMask = !SDValue(Node, 1).use_empty(); + // We can't fold a load if we are going to make two instructions. + bool MayFoldLoad = !NeedIndex || !NeedMask; + + MachineSDNode *CNode; + if (NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; + CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); + } + if (NeedIndex || !NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; + CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + } + + // Connect the flag usage to the last instruction created. + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case X86ISD::PCMPESTR: { + if (!Subtarget->hasSSE42()) + break; + + // Copy the two implicit register inputs. + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, + Node->getOperand(1), + SDValue()).getValue(1); + InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, + Node->getOperand(3), InFlag).getValue(1); + + bool NeedIndex = !SDValue(Node, 0).use_empty(); + bool NeedMask = !SDValue(Node, 1).use_empty(); + // We can't fold a load if we are going to make two instructions. + bool MayFoldLoad = !NeedIndex || !NeedMask; + + MachineSDNode *CNode; + if (NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; + CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, + InFlag); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); + } + if (NeedIndex || !NeedMask) { + unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; + unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; + CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + } + // Connect the flag usage to the last instruction created. + ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return;
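
A note on the updated doc comment near the top of the patch, which says address-mode matching performs shift-of-and to and-of-shift reassociation to expose more scaled addressing opportunities. A standalone C++ sketch of the underlying identity, using arbitrary example constants (not patch code):

#include <cassert>
#include <cstdint>

int main() {
  // (shl (and x, 0x3F), 3) == (and (shl x, 3), 0x3F << 3). After the
  // rewrite the shift feeds the address computation directly, so it can
  // become a scale of 8 while the mask stays on the index operand.
  for (uint64_t x = 0; x < 100000; x += 7) {
    uint64_t shiftOfAnd = (x & 0x3F) << 3;
    uint64_t andOfShift = (x << 3) & (0x3Full << 3);
    assert(shiftOfAnd == andOfShift);
  }
  return 0;
}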
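On the BTS/BTR/BTC check added to IsProfitableToFold: the three DAG shapes listed in the new comment are bit set, bit reset, and bit complement, which is why folding the load there would defeat the bit-instruction patterns. A standalone sketch of those identities in plain C++ (rotl32 is a local helper; rotating -2 left by n clears exactly bit n):

#include <cassert>
#include <cstdint>

// Rotate left by n. ~1u is 0xFFFFFFFE, i.e. -2 as a 32-bit value.
static uint32_t rotl32(uint32_t v, unsigned n) {
  n &= 31;
  return n == 0 ? v : (v << n) | (v >> (32 - n));
}

int main() {
  uint32_t x = 0x12345678;
  for (unsigned n = 0; n < 32; ++n) {
    uint32_t bts = x | (1u << n);      // BTS: (or X, (shl 1, n))
    uint32_t btr = x & rotl32(~1u, n); // BTR: (and X, (rotl -2, n))
    uint32_t btc = x ^ (1u << n);      // BTC: (xor X, (shl 1, n))
    // Bit n is set/cleared/flipped; every other bit is unchanged.
    assert(((bts ^ x) & ~(1u << n)) == 0 && ((bts >> n) & 1u) == 1);
    assert(((btr ^ x) & ~(1u << n)) == 0 && ((btr >> n) & 1u) == 0);
    assert((btc ^ x) == (1u << n));
  }
  return 0;
}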
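The rewritten chain check in isFusableLoadOpStorePattern replaces the node-id comparison heuristic with an explicit bounded predecessor walk (hasPredecessorHelper with Visited, LoopWorklist, and a Max of 1024). A toy, self-contained version of that idea follows; Node and hasPredecessor are hypothetical local names, not LLVM API:

#include <unordered_set>
#include <vector>

struct Node { std::vector<Node *> operands; };

// Returns true if Target is reachable from any node in Worklist by walking
// operand edges. Gives up and conservatively answers "reachable" once more
// than MaxSteps nodes have been visited, mirroring the Max = 1024 cap.
static bool hasPredecessor(const Node *Target,
                           std::vector<const Node *> Worklist,
                           unsigned MaxSteps = 1024) {
  std::unordered_set<const Node *> Visited;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue; // Already seen.
    if (N == Target)
      return true;
    if (Visited.size() > MaxSteps)
      return true; // Too large to prove safe; assume a cycle is possible.
    for (const Node *Op : N->operands)
      Worklist.push_back(Op);
  }
  return false;
}

int main() {
  Node a, b;
  b.operands.push_back(&a); // a is a predecessor of b.
  return hasPredecessor(&a, {&b}) ? 0 : 1;
}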
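matchBEXTRFromAnd now builds an X86ISD::BEXTR node with the control value Shift | (MaskSize << 8) and re-runs selection on it. A small software model of that control encoding, as a sketch (bextr here is a local helper modeling the instruction, not the compiler intrinsic):

#include <cassert>
#include <cstdint>

// Software model of BEXTR: control byte 0 holds the start bit, byte 1 the
// field length in bits; the extracted field is zero-extended.
static uint64_t bextr(uint64_t Src, uint32_t Control) {
  unsigned Start = Control & 0xFF;
  unsigned Len = (Control >> 8) & 0xFF;
  if (Start >= 64 || Len == 0)
    return 0;
  uint64_t V = Src >> Start;
  return Len >= 64 ? V : V & ((1ULL << Len) - 1);
}

int main() {
  // (and (srl x, 4), 0xFFF) is the shape matchBEXTRFromAnd recognizes;
  // Shift = 4 and MaskSize = 12 give the control value 4 | (12 << 8).
  uint64_t X = 0xDEADBEEFCAFEF00DULL;
  assert(((X >> 4) & 0xFFF) == bextr(X, 4 | (12u << 8)));
  return 0;
}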
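shrinkAndImmediate reverses a SimplifyDemandedBits shrink: when the variable operand's high bits are known zero, setting those bits in the mask leaves the result unchanged but turns a large positive immediate into a small sign-extended negative one. A numeric sketch of why the two masks are interchangeable, with made-up example values:

#include <cassert>
#include <cstdint>

int main() {
  // Pretend x is known to have its upper 32 bits clear, e.g. because it
  // came from a 32-bit zero-extending load.
  uint64_t x = 0x00000000FEEDFACEULL;

  // 0x00000000FFFFFFF0 needs a full 32-bit immediate, while
  // 0xFFFFFFFFFFFFFFF0 is -16 and encodes as a sign-extended imm8.
  // The masks differ only on x's known-zero high bits, so the ANDs agree.
  uint64_t widePositive = x & 0x00000000FFFFFFF0ULL;
  uint64_t negativeImm8 = x & 0xFFFFFFFFFFFFFFF0ULL; // shorter encoding
  assert(widePositive == negativeImm8);
  return 0;
}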
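The consolidated CMP-to-TEST path above picks the narrowest TEST whose mask fits (testb, testw only at minsize, testl), guarded by hasNoSignedComparisonUses because narrowing relocates the sign flag. A standalone check of the zero-flag equivalence the transform relies on:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t mask = 0x8; // fits in 8 bits, bit 7 clear
  for (uint32_t x = 0; x < 100000; x += 37) {
    // "testl %eax, $8" and "testb %al, $8" set ZF identically, since only
    // bit 3 participates.
    bool wide = (x & mask) == 0;
    bool narrow = ((uint8_t)x & (uint8_t)mask) == 0;
    assert(wide == narrow);
    // SF, however, would come from bit 7 instead of bit 31 after
    // narrowing, which is why masks with the narrow sign bit set require
    // hasNoSignedComparisonUses.
  }
  return 0;
}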
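Finally, on the new PCMPISTR/PCMPESTR selection: the hardware has separate index (PCMPISTRI) and mask (PCMPISTRM) instructions, so a node whose index and mask results are both used costs two instructions, which is why MayFoldLoad is !NeedIndex || !NeedMask. A sketch using the standard SSE4.2 intrinsics (compile with -msse4.2) that exercises both results of one logical comparison:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i a = _mm_setr_epi8('a', 'b', 'c', 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0);
  __m128i b = _mm_setr_epi8('x', 'b', 'x', 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0, 0, 0);
  const int Mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH;
  int Idx = _mm_cmpistri(a, b, Mode);      // lowers to pcmpistri
  __m128i Mask = _mm_cmpistrm(a, b, Mode); // lowers to pcmpistrm
  printf("first match at %d, mask bits 0x%x\n", Idx,
         (unsigned)_mm_cvtsi128_si32(Mask));
  return 0;
}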