path: root/lib/Target/X86/X86ISelDAGToDAG.cpp
Diffstat (limited to 'lib/Target/X86/X86ISelDAGToDAG.cpp')
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp  975
1 file changed, 645 insertions, 330 deletions
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 660c1eff3c4b..a28d4eac8393 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
@@ -100,11 +101,11 @@ namespace {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void dump() {
+ void dump(SelectionDAG *DAG = nullptr) {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
if (Base_Reg.getNode())
- Base_Reg.getNode()->dump();
+ Base_Reg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
if (BaseType == FrameIndexBase)
@@ -112,7 +113,7 @@ namespace {
dbgs() << " Scale " << Scale << '\n'
<< "IndexReg ";
if (IndexReg.getNode())
- IndexReg.getNode()->dump();
+ IndexReg.getNode()->dump(DAG);
else
dbgs() << "nul\n";
dbgs() << " Disp " << Disp << '\n'
@@ -181,6 +182,7 @@ namespace {
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
void PreprocessISelDAG() override;
+ void PostprocessISelDAG() override;
// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"
@@ -213,7 +215,7 @@ namespace {
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectScalarSSELoad(SDNode *Root, SDValue N,
+ bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment,
@@ -225,7 +227,7 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment);
- // Convience method where P is also root.
+ // Convenience method where P is also root.
bool tryFoldLoad(SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
@@ -233,6 +235,12 @@ namespace {
return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
}
+ // Try to fold a vector load. This makes sure the load isn't non-temporal.
+ bool tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment);
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -368,6 +376,11 @@ namespace {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
+ /// Return a target constant with the specified value, of type i64.
+ inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
+ return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
+ }
+
SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
const SDLoc &DL) {
assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
@@ -401,7 +414,7 @@ namespace {
return Subtarget->getInstrInfo();
}
- /// \brief Address-mode matching performs shift-of-and to and-of-shift
+ /// Address-mode matching performs shift-of-and to and-of-shift
/// reassociation in order to expose more scaled addressing
/// opportunities.
bool ComplexPatternFuncMutatesDAG() const override {
@@ -440,10 +453,15 @@ namespace {
}
bool foldLoadStoreIntoMemOperand(SDNode *Node);
-
bool matchBEXTRFromAnd(SDNode *Node);
-
+ bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
+
+ MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node);
+ MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
+ const SDLoc &dl, MVT VT, SDNode *Node,
+ SDValue &InFlag);
};
}
@@ -452,19 +470,21 @@ namespace {
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
- if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
- Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
- Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU ||
- Opcode == X86ISD::CMPM_RND) {
+ if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
+ Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
EVT OpVT = N->getOperand(0).getValueType();
- if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
+ if (OpVT.is256BitVector() || OpVT.is128BitVector())
return Subtarget->hasVLX();
return true;
}
+ // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
+ if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM ||
+ Opcode == X86ISD::FSETCCM_RND)
+ return true;
return false;
}
@@ -518,10 +538,21 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
// addl 4(%esp), %eax
// The former is 2 bytes shorter. In case where the increment is 1, then
// the saving can be 4 bytes (by using incl %eax).
- if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+ if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
if (Imm->getAPIntValue().isSignedIntN(8))
return false;
+ // If this is a 64-bit AND with an immediate that fits in 32-bits,
+ // prefer using the smaller and over folding the load. This is needed to
+ // make sure immediates created by shrinkAndImmediate are always folded.
+ // Ideally we would narrow the load during DAG combine and get the
+ // best of both worlds.
+ if (U->getOpcode() == ISD::AND &&
+ Imm->getAPIntValue().getBitWidth() == 64 &&
+ Imm->getAPIntValue().isIntN(32))
+ return false;
+ }
+
// If the other operand is a TLS address, we should fold it instead.
// This produces
// movl %gs:0, %eax
@@ -537,10 +568,60 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
return false;
}
+
+ // Don't fold load if this matches the BTS/BTR/BTC patterns.
+ // BTS: (or X, (shl 1, n))
+ // BTR: (and X, (rotl -2, n))
+ // BTC: (xor X, (shl 1, n))
+ if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) {
+ if (U->getOperand(0).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(0).getOperand(0)))
+ return false;
+
+ if (U->getOperand(1).getOpcode() == ISD::SHL &&
+ isOneConstant(U->getOperand(1).getOperand(0)))
+ return false;
+ }
+ if (U->getOpcode() == ISD::AND) {
+ SDValue U0 = U->getOperand(0);
+ SDValue U1 = U->getOperand(1);
+ if (U0.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+
+ if (U1.getOpcode() == ISD::ROTL) {
+ auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
+ if (C && C->getSExtValue() == -2)
+ return false;
+ }
+ }
+
+ break;
}
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ // Don't fold a load into a shift by immediate. The BMI2 instructions
+ // support folding a load, but not an immediate. The legacy instructions
+ // support folding an immediate, but can't fold a load. Folding an
+ // immediate is preferable to folding a load.
+ if (isa<ConstantSDNode>(U->getOperand(1)))
+ return false;
+
+ break;
}
}
+ // Prevent folding a load if this can be implemented with an insert_subreg or
+ // a move that implicitly zeroes.
+ if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
+ isNullConstant(Root->getOperand(2)) &&
+ (Root->getOperand(0).isUndef() ||
+ ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())))
+ return false;
+
return true;
}
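
The BTR pattern referenced above, (and X, (rotl -2, n)), relies on the fact that rotating the constant -2 (all ones except bit 0) left by n produces a mask with only bit n clear, which is exactly what a single BTR computes. A minimal standalone sketch of that identity, independent of this patch:

```cpp
#include <cassert>
#include <cstdint>

// Rotate a 32-bit value left by n (0 <= n < 32).
static uint32_t rotl32(uint32_t v, unsigned n) {
  return n == 0 ? v : (v << n) | (v >> (32 - n));
}

int main() {
  // rotl(-2, n) clears exactly bit n, so (x & rotl(-2, n)) == (x & ~(1u << n)),
  // i.e. the operand pattern a BTR instruction implements.
  for (unsigned n = 0; n < 32; ++n)
    assert(rotl32(0xFFFFFFFEu, n) == ~(1u << n));
  return 0;
}
```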
@@ -628,12 +709,24 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
+ // If this is a target specific AND node with no flag usages, turn it back
+ // into ISD::AND to enable test instruction matching.
+ if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) {
+ SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+ --I;
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ ++I;
+ CurDAG->DeleteNode(N);
+ continue;
+ }
+
if (OptLevel != CodeGenOpt::None &&
- // Only does this when target favors doesn't favor register indirect
- // call.
+ // Only do this when the target can fold the load into the call or
+ // jmp.
+ !Subtarget->useRetpoline() &&
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
- // Only does this if load can be folded into TC_RETURN.
(Subtarget->is64Bit() ||
!getTargetMachine().isPositionIndependent())))) {
/// Also try moving call address load from outside callseq_start to just
@@ -735,6 +828,70 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
+void X86DAGToDAGISel::PostprocessISelDAG() {
+ // Skip peepholes at -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // Attempt to remove vectors moves that were inserted to zero upper bits.
+
+ SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+ ++Position;
+
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ // Skip dead nodes and any non-machine opcodes.
+ if (N->use_empty() || !N->isMachineOpcode())
+ continue;
+
+ if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
+ continue;
+
+ unsigned SubRegIdx = N->getConstantOperandVal(2);
+ if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
+ continue;
+
+ SDValue Move = N->getOperand(1);
+ if (!Move.isMachineOpcode())
+ continue;
+
+ // Make sure it's one of the move opcodes we recognize.
+ switch (Move.getMachineOpcode()) {
+ default:
+ continue;
+ case X86::VMOVAPDrr: case X86::VMOVUPDrr:
+ case X86::VMOVAPSrr: case X86::VMOVUPSrr:
+ case X86::VMOVDQArr: case X86::VMOVDQUrr:
+ case X86::VMOVAPDYrr: case X86::VMOVUPDYrr:
+ case X86::VMOVAPSYrr: case X86::VMOVUPSYrr:
+ case X86::VMOVDQAYrr: case X86::VMOVDQUYrr:
+ case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr:
+ case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr:
+ case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
+ case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
+ case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr:
+ case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr:
+ case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
+ case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
+ break;
+ }
+
+ SDValue In = Move.getOperand(0);
+ if (!In.isMachineOpcode() ||
+ In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
+ continue;
+
+ // The producing instruction is another vector instruction, so we can drop the
+ // move.
+ CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
+
+ // If the move is now dead, delete it.
+ if (Move.getNode()->use_empty())
+ CurDAG->RemoveDeadNode(Move.getNode());
+ }
+}
+
+
/// Emit any code that needs to be executed only in the main function.
void X86DAGToDAGISel::emitSpecialCodeForMain() {
if (Subtarget->isTargetCygMing()) {
@@ -771,9 +928,14 @@ static bool isDispSafeForFrameIndex(int64_t Val) {
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
+ // If there's no offset to fold, we don't need to do any work.
+ if (Offset == 0)
+ return false;
+
// Cannot combine ExternalSymbol displacements with integer offsets.
- if (Offset != 0 && (AM.ES || AM.MCSym))
+ if (AM.ES || AM.MCSym)
return true;
+
int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
@@ -827,94 +989,60 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
if (AM.hasSymbolicDisplacement())
return true;
- SDValue N0 = N.getOperand(0);
+ bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
+
+ // We can't use an addressing mode in the 64-bit large code model. In the
+ // medium code model, we can use such a mode when RIP wrappers are present.
+ // That signifies access to globals that are known to be "near", such as the
+ // GOT itself.
CodeModel::Model M = TM.getCodeModel();
+ if (Subtarget->is64Bit() &&
+ (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel)))
+ return true;
- // Handle X86-64 rip-relative addresses. We check this before checking direct
- // folding because RIP is preferable to non-RIP accesses.
- if (Subtarget->is64Bit() && N.getOpcode() == X86ISD::WrapperRIP &&
- // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
- // they cannot be folded into immediate fields.
- // FIXME: This can be improved for kernel and other models?
- (M == CodeModel::Small || M == CodeModel::Kernel)) {
- // Base and index reg must be 0 in order to use %rip as base.
- if (AM.hasBaseOrIndexReg())
- return true;
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.GV = G->getGlobal();
- AM.SymbolFlags = G->getTargetFlags();
- if (foldOffsetIntoAddress(G->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
- AM.SymbolFlags = CP->getTargetFlags();
- if (foldOffsetIntoAddress(CP->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
- AM.ES = S->getSymbol();
- AM.SymbolFlags = S->getTargetFlags();
- } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
- AM.MCSym = S->getMCSymbol();
- } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
- AM.JT = J->getIndex();
- AM.SymbolFlags = J->getTargetFlags();
- } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
- X86ISelAddressMode Backup = AM;
- AM.BlockAddr = BA->getBlockAddress();
- AM.SymbolFlags = BA->getTargetFlags();
- if (foldOffsetIntoAddress(BA->getOffset(), AM)) {
- AM = Backup;
- return true;
- }
- } else
- llvm_unreachable("Unhandled symbol reference node.");
+ // Base and index reg must be 0 in order to use %rip as base.
+ if (IsRIPRel && AM.hasBaseOrIndexReg())
+ return true;
- if (N.getOpcode() == X86ISD::WrapperRIP)
- AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
- return false;
- }
+ // Make a local copy in case we can't do this fold.
+ X86ISelAddressMode Backup = AM;
- // Handle the case when globals fit in our immediate field: This is true for
- // X86-32 always and X86-64 when in -mcmodel=small mode. In 64-bit
- // mode, this only applies to a non-RIP-relative computation.
- if (!Subtarget->is64Bit() ||
- M == CodeModel::Small || M == CodeModel::Kernel) {
- assert(N.getOpcode() != X86ISD::WrapperRIP &&
- "RIP-relative addressing already handled");
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
- AM.GV = G->getGlobal();
- AM.Disp += G->getOffset();
- AM.SymbolFlags = G->getTargetFlags();
- } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
- AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
- AM.Disp += CP->getOffset();
- AM.SymbolFlags = CP->getTargetFlags();
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
- AM.ES = S->getSymbol();
- AM.SymbolFlags = S->getTargetFlags();
- } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
- AM.MCSym = S->getMCSymbol();
- } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
- AM.JT = J->getIndex();
- AM.SymbolFlags = J->getTargetFlags();
- } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
- AM.BlockAddr = BA->getBlockAddress();
- AM.Disp += BA->getOffset();
- AM.SymbolFlags = BA->getTargetFlags();
- } else
- llvm_unreachable("Unhandled symbol reference node.");
- return false;
+ int64_t Offset = 0;
+ SDValue N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ AM.GV = G->getGlobal();
+ AM.SymbolFlags = G->getTargetFlags();
+ Offset = G->getOffset();
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.SymbolFlags = CP->getTargetFlags();
+ Offset = CP->getOffset();
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+ AM.ES = S->getSymbol();
+ AM.SymbolFlags = S->getTargetFlags();
+ } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
+ AM.MCSym = S->getMCSymbol();
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ AM.JT = J->getIndex();
+ AM.SymbolFlags = J->getTargetFlags();
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ Offset = BA->getOffset();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
+
+ if (foldOffsetIntoAddress(Offset, AM)) {
+ AM = Backup;
+ return true;
}
- return true;
+ if (IsRIPRel)
+ AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+
+ // Commit the changes now that we know this fold is safe.
+ return false;
}
/// Add the specified node to the specified addressing mode, returning true if
@@ -988,10 +1116,16 @@ bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM,
// IDs! The selection DAG must no longer depend on their uniqueness when this
// is used.
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
- if (N.getNode()->getNodeId() == -1 ||
- N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) {
- DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode());
- N.getNode()->setNodeId(Pos.getNode()->getNodeId());
+ if (N->getNodeId() == -1 ||
+ (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
+ SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
+ DAG.RepositionNode(Pos->getIterator(), N.getNode());
+ // Mark Node as invalid for pruning, since after this it may be a successor to a
+ // selected node but otherwise be in the same position as Pos.
+ // Conservatively mark it with the same -abs(Id) to assure node id
+ // invariant is preserved.
+ N->setNodeId(Pos->getNodeId());
+ SelectionDAGISel::InvalidateNodeId(N.getNode());
}
}
@@ -1196,10 +1330,10 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth) {
SDLoc dl(N);
- DEBUG({
- dbgs() << "MatchAddress: ";
- AM.dump();
- });
+ LLVM_DEBUG({
+ dbgs() << "MatchAddress: ";
+ AM.dump(CurDAG);
+ });
// Limit recursion.
if (Depth > 5)
return matchAddressBase(N, AM);
@@ -1508,6 +1642,12 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
// TODO: Support other operations.
switch (N.getOpcode()) {
+ case ISD::Constant: {
+ uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+ if (!foldOffsetIntoAddress(Val, AM))
+ return false;
+ break;
+ }
case X86ISD::Wrapper:
if (!matchWrapper(N, AM))
return false;
@@ -1523,7 +1663,7 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
X86ISelAddressMode AM;
auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
AM.IndexReg = Mgs->getIndex();
- AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8;
+ AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
@@ -1534,14 +1674,8 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
- // If Base is 0, the whole address is in index and the Scale is 1
- if (isa<ConstantSDNode>(N)) {
- assert(cast<ConstantSDNode>(N)->isNullValue() &&
- "Unexpected base in gather/scatter");
- AM.Scale = 1;
- }
- // Otherwise, try to match into the base and displacement fields.
- else if (matchVectorAddress(N, AM))
+ // Try to match into the base and displacement fields.
+ if (matchVectorAddress(N, AM))
return false;
MVT VT = N.getSimpleValueType();
@@ -1604,8 +1738,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
// We can only fold a load if all nodes between it and the root node have a
// single use. If there are additional uses, we could end up duplicating the
// load.
-static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
- SDNode *User = *N->use_begin();
+static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
while (User != Root) {
if (!User->hasOneUse())
return false;
@@ -1622,17 +1755,19 @@ static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
/// We also return:
/// PatternChainNode: this is the matched node that has a chain input and
/// output.
-bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
+bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index,
SDValue &Disp, SDValue &Segment,
SDValue &PatternNodeWithChain) {
+ if (!hasSingleUsesFromRoot(Root, Parent))
+ return false;
+
// We can allow a full vector load here since narrowing a load is ok.
if (ISD::isNON_EXTLoad(N.getNode())) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1643,8 +1778,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1658,8 +1792,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1675,8 +1808,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0).getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
- hasSingleUsesFromRoot(Root, N.getNode())) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
@@ -1699,10 +1831,10 @@ bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
}
// In static codegen with small code model, we can get the address of a label
- // into a register with 'movl'. TableGen has already made sure we're looking
- // at a label of some kind.
- assert(N->getOpcode() == X86ISD::Wrapper &&
- "Unexpected node type for MOV32ri64");
+ // into a register with 'movl'
+ if (N->getOpcode() != X86ISD::Wrapper)
+ return false;
+
N = N.getOperand(0);
// At least GNU as does not accept 'movl' for TPOFF relocations.
@@ -1907,6 +2039,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
N.getOperand(1), Base, Scale, Index, Disp, Segment);
}
+bool X86DAGToDAGISel::tryFoldVecLoad(SDNode *Root, SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ if (!ISD::isNON_EXTLoad(N.getNode()) ||
+ useNonTemporalLoad(cast<LoadSDNode>(N)) ||
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
+ return false;
+
+ return selectAddr(N.getNode(),
+ N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -2092,50 +2238,84 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
LoadNode->getOffset() != StoreNode->getOffset())
return false;
- // Check if the chain is produced by the load or is a TokenFactor with
- // the load output chain as an operand. Return InputChain by reference.
+ bool FoundLoad = false;
+ SmallVector<SDValue, 4> ChainOps;
+ SmallVector<const SDNode *, 4> LoopWorklist;
+ SmallPtrSet<const SDNode *, 16> Visited;
+ const unsigned int Max = 1024;
+
+ // Visualization of Load-Op-Store fusion:
+ // -------------------------
+ // Legend:
+ // *-lines = Chain operand dependencies.
+ // |-lines = Normal operand dependencies.
+ // Dependencies flow down and right. n-suffix references multiple nodes.
+ //
+ // C Xn C
+ // * * *
+ // * * *
+ // Xn A-LD Yn TF Yn
+ // * * \ | * |
+ // * * \ | * |
+ // * * \ | => A--LD_OP_ST
+ // * * \| \
+ // TF OP \
+ // * | \ Zn
+ // * | \
+ // A-ST Zn
+ //
+
+ // This merge induces dependences from: #1: Xn -> LD, OP, Zn
+ // #2: Yn -> LD
+ // #3: ST -> Zn
+
+ // Ensure the transform is safe by checking for the dual
+ // dependencies to make sure we do not induce a loop.
+
+ // As LD is a predecessor to both OP and ST we can do this by checking:
+ // a). if LD is a predecessor to a member of Xn or Yn.
+ // b). if a Zn is a predecessor to ST.
+
+ // However, (b) can only occur through being a chain predecessor to
+ // ST, which is the same as Zn being a member or predecessor of Xn,
+ // which is a subset of LD being a predecessor of Xn. So it's
+ // subsumed by check (a).
+
SDValue Chain = StoreNode->getChain();
- bool ChainCheck = false;
+ // Gather X elements in ChainOps.
if (Chain == Load.getValue(1)) {
- ChainCheck = true;
- InputChain = LoadNode->getChain();
+ FoundLoad = true;
+ ChainOps.push_back(Load.getOperand(0));
} else if (Chain.getOpcode() == ISD::TokenFactor) {
- SmallVector<SDValue, 4> ChainOps;
for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) {
SDValue Op = Chain.getOperand(i);
if (Op == Load.getValue(1)) {
- ChainCheck = true;
+ FoundLoad = true;
// Drop Load, but keep its chain. No cycle check necessary.
ChainOps.push_back(Load.getOperand(0));
continue;
}
-
- // Make sure using Op as part of the chain would not cause a cycle here.
- // In theory, we could check whether the chain node is a predecessor of
- // the load. But that can be very expensive. Instead visit the uses and
- // make sure they all have smaller node id than the load.
- int LoadId = LoadNode->getNodeId();
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = UI->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != 0)
- continue;
- if (UI->getNodeId() > LoadId)
- return false;
- }
-
+ LoopWorklist.push_back(Op.getNode());
ChainOps.push_back(Op);
}
-
- if (ChainCheck)
- // Make a new TokenFactor with all the other input chains except
- // for the load.
- InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
- MVT::Other, ChainOps);
}
- if (!ChainCheck)
+
+ if (!FoundLoad)
return false;
+ // Worklist is currently Xn. Add Yn to worklist.
+ for (SDValue Op : StoredVal->ops())
+ if (Op.getNode() != LoadNode)
+ LoopWorklist.push_back(Op.getNode());
+
+ // Check (a) if Load is a predecessor to Xn + Yn
+ if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
+ true))
+ return false;
+
+ InputChain =
+ CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
return true;
}
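
The cycle check described above reduces to a bounded reachability query: starting from the store's other chain operands (Xn) and the operation's other operands (Yn), the load must not be reachable. The real code uses SDNode::hasPredecessorHelper; the sketch below is only an illustration of that worklist search over a made-up Node type, not the SelectionDAG API.

```cpp
#include <unordered_set>
#include <vector>

struct Node {
  std::vector<const Node *> Operands; // edges toward predecessors
};

// Returns true if Target is reachable from any node in Worklist, giving up
// (and conservatively answering true) once more than Max nodes were visited.
static bool isPredecessor(const Node *Target,
                          std::vector<const Node *> Worklist,
                          unsigned Max = 1024) {
  std::unordered_set<const Node *> Visited;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue;                      // already seen
    if (N == Target || Visited.size() > Max)
      return true;                   // found a path, or too big to prove safe
    for (const Node *Op : N->Operands)
      Worklist.push_back(Op);
  }
  return false;
}
```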
@@ -2177,7 +2357,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::INC:
case X86ISD::DEC:
case X86ISD::ADD:
+ case X86ISD::ADC:
case X86ISD::SUB:
+ case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR:
@@ -2225,7 +2407,9 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
break;
}
case X86ISD::ADD:
+ case X86ISD::ADC:
case X86ISD::SUB:
+ case X86ISD::SBB:
case X86ISD::AND:
case X86ISD::OR:
case X86ISD::XOR: {
@@ -2234,9 +2418,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
X86::ADD8mr);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
+ X86::ADC8mr);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
X86::SUB8mr);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
+ X86::SBB8mr);
case X86ISD::AND:
return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
X86::AND8mr);
@@ -2253,8 +2443,12 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
switch (Opc) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
case X86ISD::OR:
@@ -2270,9 +2464,15 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
case X86ISD::ADD:
return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
X86::ADD8mi);
+ case X86ISD::ADC:
+ return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
+ X86::ADC8mi);
case X86ISD::SUB:
return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
X86::SUB8mi);
+ case X86ISD::SBB:
+ return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
+ X86::SBB8mi);
case X86ISD::AND:
return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
X86::AND8mi);
@@ -2320,10 +2520,21 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
}
}
- const SDValue Ops[] = {Base, Scale, Index, Disp,
- Segment, Operand, InputChain};
- Result =
- CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+ if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) {
+ SDValue CopyTo =
+ CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
+ StoredVal.getOperand(2), SDValue());
+
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, CopyTo, CopyTo.getValue(1)};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ } else {
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, InputChain};
+ Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
+ Ops);
+ }
break;
}
default:
@@ -2335,6 +2546,8 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
MemOp[1] = LoadNode->getMemOperand();
Result->setMemRefs(MemOp, MemOp + 2);
+ // Update Load Chain uses as well.
+ ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
CurDAG->RemoveDeadNode(Node);
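
With ADC/SBB now accepted by the load-store folding above, a read-modify-write of a double-word integer can keep both halves in memory-destination form. A small source-level illustration of the pattern this enables; the exact assembly depends on the rest of the pipeline, so it is noted only in a comment.

```cpp
// Wide add into memory: the low half maps to an ADDmr-style instruction and,
// with this change, the carry-consuming high half can map to ADCmr instead of
// a separate load/adc/store sequence (illustrative; actual codegen may differ).
void addToMem(unsigned __int128 *p, unsigned __int128 x) {
  *p += x;
}
```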
@@ -2388,57 +2601,169 @@ bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
if (Shift + MaskSize > NVT.getSizeInBits())
return false;
- SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
- unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
- unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+ // Create a BEXTR node and run it through selection.
+ SDValue C = CurDAG->getConstant(Shift | (MaskSize << 8), dl, NVT);
+ SDValue New = CurDAG->getNode(X86ISD::BEXTR, dl, NVT,
+ N0->getOperand(0), C);
+ ReplaceNode(Node, New.getNode());
+ SelectCode(New.getNode());
+ return true;
+}
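
The BEXTR control operand built above with `Shift | (MaskSize << 8)` packs the start bit in the low byte and the field length in the second byte. A standalone sketch of that encoding and the extraction it describes (a software model, independent of this patch):

```cpp
#include <cassert>
#include <cstdint>

// Software model of BEXTR: extract Len bits of X starting at bit Start.
static uint64_t bextr(uint64_t X, unsigned Start, unsigned Len) {
  if (Len == 0 || Start >= 64) return 0;
  X >>= Start;
  return Len >= 64 ? X : X & ((1ULL << Len) - 1);
}

int main() {
  // (x >> 4) & 0xFF corresponds to Shift = 4, MaskSize = 8, so the control
  // immediate is 4 | (8 << 8) == 0x0804.
  uint64_t X = 0x123456789ABCDEF0ULL;
  unsigned Shift = 4, MaskSize = 8;
  uint64_t Control = Shift | (MaskSize << 8);
  assert(Control == 0x0804);
  assert(bextr(X, Shift, MaskSize) == ((X >> Shift) & 0xFF));
  return 0;
}
```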
- // BMI requires the immediate to placed in a register.
- if (!Subtarget->hasTBM()) {
- ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
- MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
- New = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New), 0);
- if (NVT == MVT::i64) {
- New =
- SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
- CurDAG->getTargetConstant(0, dl, MVT::i64), New,
- CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
- 0);
- }
+// Emit a PCMPISTR(I/M) instruction.
+MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue Imm = Node->getOperand(2);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // If there is a load, it will be behind a bitcast. We don't need to check
+ // alignment on this load.
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (MayFoldLoad && N1->getOpcode() == ISD::BITCAST && N1->hasOneUse() &&
+ tryFoldVecLoad(Node, N1.getNode(), N1.getOperand(0), Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4)) {
+ SDValue Load = N1.getOperand(0);
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Load.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
+ // Record the mem-refs
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ return CNode;
}
- MachineSDNode *NewNode;
- SDValue Input = N0->getOperand(0);
+ SDValue Ops[] = { N0, N1, Imm };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ return CNode;
+}
+
+// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we need
+// to emit a second instruction after this one. This is needed since we have two
+// copyToReg nodes glued before this and we need to continue that glue through.
+MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
+ bool MayFoldLoad, const SDLoc &dl,
+ MVT VT, SDNode *Node,
+ SDValue &InFlag) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N2 = Node->getOperand(2);
+ SDValue Imm = Node->getOperand(4);
+ const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
+
+ // If there is a load, it will be behind a bitcast. We don't need to check
+ // alignment on this load.
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
- SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
- NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ if (MayFoldLoad && N2->getOpcode() == ISD::BITCAST && N2->hasOneUse() &&
+ tryFoldVecLoad(Node, N2.getNode(), N2.getOperand(0), Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4)) {
+ SDValue Load = N2.getOperand(0);
+ SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
+ Load.getOperand(0), InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 3);
// Update the chain.
- ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 2));
// Record the mem-refs
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand();
- NewNode->setMemRefs(MemOp, MemOp + 1);
- } else {
- NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
+ MemOp[0] = cast<LoadSDNode>(Load)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
+ return CNode;
}
- ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
+ SDValue Ops[] = { N0, N2, Imm, InFlag };
+ SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
+ MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 2);
+ return CNode;
+}
+
+/// If the high bits of an 'and' operand are known zero, try setting the
+/// high bits of an 'and' constant operand to produce a smaller encoding by
+/// creating a small, sign-extended negative immediate rather than a large
+/// positive one. This reverses a transform in SimplifyDemandedBits that
+/// shrinks mask constants by clearing bits. There is also a possibility that
+/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
+/// case, just replace the 'and'. Return 'true' if the node is replaced.
+bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
+ // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
+ // have immediate operands.
+ MVT VT = And->getSimpleValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
+ if (!And1C)
+ return false;
+
+ // Bail out if the mask constant is already negative. It can't shrink any further.
+ // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
+ // patterns to use a 32-bit and instead of a 64-bit and by relying on the
+ // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
+ // are negative too.
+ APInt MaskVal = And1C->getAPIntValue();
+ unsigned MaskLZ = MaskVal.countLeadingZeros();
+ if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32))
+ return false;
+
+ // Don't extend into the upper 32 bits of a 64 bit mask.
+ if (VT == MVT::i64 && MaskLZ >= 32) {
+ MaskLZ -= 32;
+ MaskVal = MaskVal.trunc(32);
+ }
+
+ SDValue And0 = And->getOperand(0);
+ APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
+ APInt NegMaskVal = MaskVal | HighZeros;
+
+ // If a negative constant would not allow a smaller encoding, there's no need
+ // to continue. Only change the constant when we know it's a win.
+ unsigned MinWidth = NegMaskVal.getMinSignedBits();
+ if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32))
+ return false;
+
+ // Extend masks if we truncated above.
+ if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) {
+ NegMaskVal = NegMaskVal.zext(64);
+ HighZeros = HighZeros.zext(64);
+ }
+
+ // The variable operand must be all zeros in the top bits to allow using the
+ // new, negative constant as the mask.
+ if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
+ return false;
+
+ // Check if the mask is -1. In that case, this is an unnecessary instruction
+ // that escaped earlier analysis.
+ if (NegMaskVal.isAllOnesValue()) {
+ ReplaceNode(And, And0.getNode());
+ return true;
+ }
+
+ // A negative mask allows a smaller encoding. Create a new 'and' node.
+ SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
+ SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
+ ReplaceNode(And, NewAnd.getNode());
+ SelectCode(NewAnd.getNode());
return true;
}
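
The encoding win that shrinkAndImmediate chases can be seen with plain integers: when the high bits of the other operand are known zero, turning the mask's leading zeros into ones changes a 4-byte immediate into a sign-extended 1-byte one without changing the result. A small standalone illustration, assuming the value really has those high bits clear:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Suppose analysis proves the top 4 bits of x are zero.
  uint32_t x = 0x0ABCDEF5u;

  // Original mask needs a 4-byte immediate: andl $0x0FFFFFF0, %eax.
  uint32_t wideMask = 0x0FFFFFF0u;

  // Setting the known-zero high bits yields -16, which fits in a
  // sign-extended 1-byte immediate (andl $-16, %eax) yet masks x identically.
  uint32_t negMask = 0xFFFFFFF0u; // == (uint32_t)-16

  assert((x & wideMask) == (x & negMask));
  return 0;
}
```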
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
- unsigned Opc, MOpc;
unsigned Opcode = Node->getOpcode();
SDLoc dl(Node);
- DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
-
if (Node->isMachineOpcode()) {
- DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
Node->setNodeId(-1);
return; // Already selected.
}
@@ -2483,9 +2808,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
case ISD::AND:
- // Try to match BEXTR/BEXTRI instruction.
if (matchBEXTRFromAnd(Node))
return;
+ if (shrinkAndImmediate(Node))
+ return;
LLVM_FALLTHROUGH;
case ISD::OR:
@@ -2577,7 +2903,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
+ unsigned Opc = (Opcode == X86ISD::SMUL8 ? X86::IMUL8r : X86::MUL8r);
SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL,
N0, SDValue()).getValue(1);
@@ -2594,7 +2920,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- unsigned LoReg;
+ unsigned LoReg, Opc;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
// MVT::i8 is handled by X86ISD::UMUL8.
@@ -2619,13 +2945,12 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
+ unsigned Opc, MOpc;
bool isSigned = Opcode == ISD::SMUL_LOHI;
bool hasBMI2 = Subtarget->hasBMI2();
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
- case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
@@ -2634,8 +2959,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
- case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
}
@@ -2644,14 +2967,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned SrcReg, LoReg, HiReg;
switch (Opc) {
default: llvm_unreachable("Unknown MUL opcode!");
- case X86::IMUL8r:
- case X86::MUL8r:
- SrcReg = LoReg = X86::AL; HiReg = X86::AH;
- break;
- case X86::IMUL16r:
- case X86::MUL16r:
- SrcReg = LoReg = X86::AX; HiReg = X86::DX;
- break;
case X86::IMUL32r:
case X86::MUL32r:
SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
@@ -2721,27 +3036,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
}
- // Prevent use of AH in a REX instruction by referencing AX instead.
- if (HiReg == X86::AH && Subtarget->is64Bit() &&
- !SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::AX, MVT::i16, InFlag);
- InFlag = Result.getValue(2);
- // Get the low part if needed. Don't use getCopyFromReg for aliasing
- // registers.
- if (!SDValue(Node, 0).use_empty())
- ReplaceUses(SDValue(Node, 0),
- CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
-
- // Shift AX down 8 bits.
- Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
- Result,
- CurDAG->getTargetConstant(8, dl, MVT::i8)),
- 0);
- // Then truncate it down to i8.
- ReplaceUses(SDValue(Node, 1),
- CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
- }
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
if (!ResLo.getNode()) {
@@ -2751,7 +3045,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = ResLo.getValue(2);
}
ReplaceUses(SDValue(Node, 0), ResLo);
- DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2762,7 +3057,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
InFlag = ResHi.getValue(2);
}
ReplaceUses(SDValue(Node, 1), ResHi);
- DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
@@ -2776,6 +3072,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
+ unsigned Opc, MOpc;
bool isSigned = (Opcode == ISD::SDIVREM ||
Opcode == X86ISD::SDIVREM8_SEXT_HREG);
if (!isSigned) {
@@ -2909,7 +3206,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
unsigned AHExtOpcode =
- isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8;
+ isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;
SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
MVT::Glue, AHCopy, InFlag);
@@ -2924,7 +3221,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
}
ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the division (low) result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
@@ -2932,7 +3230,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
LoReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 0), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
// Copy the remainder (high) result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
@@ -2940,18 +3239,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
HiReg, NVT, InFlag);
InFlag = Result.getValue(2);
ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
+ dbgs() << '\n');
}
CurDAG->RemoveDeadNode(Node);
return;
}
- case X86ISD::CMP:
- case X86ISD::SUB: {
- // Sometimes a SUB is used to perform comparison.
- if (Opcode == X86ISD::SUB && Node->hasAnyUseOfValue(0))
- // This node is not a CMP.
- break;
+ case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
@@ -2962,8 +3257,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
- if ((N0.getOpcode() == ISD::AND ||
- (N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) &&
+ if (N0.getOpcode() == ISD::AND &&
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
@@ -2971,98 +3265,119 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (!C) break;
uint64_t Mask = C->getZExtValue();
- // For example, convert "testl %eax, $8" to "testb %al, $8"
+ MVT VT;
+ int SubRegOp;
+ unsigned Op;
+
if (isUInt<8>(Mask) &&
(!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the l-register.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
- MVT::i8, Reg);
-
- // Emit a testb.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
+ // For example, convert "testl %eax, $8" to "testb %al, $8"
+ VT = MVT::i8;
+ SubRegOp = X86::sub_8bit;
+ Op = X86::TEST8ri;
+ } else if (OptForMinSize && isUInt<16>(Mask) &&
+ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+ // For example, "testl %eax, $32776" to "testw %ax, $32776".
+ // NOTE: We only want to form TESTW instructions if optimizing for
+ // min size. Otherwise we only save one byte and possibly get a length
+ // changing prefix penalty in the decoders.
+ VT = MVT::i16;
+ SubRegOp = X86::sub_16bit;
+ Op = X86::TEST16ri;
+ } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
+ (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
+ // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+ // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
+ // Otherwise, we find ourselves in a position where we have to do
+ // promotion. If previous passes did not promote the and, we assume
+ // they had a good reason not to and do not promote here.
+ VT = MVT::i32;
+ SubRegOp = X86::sub_32bit;
+ Op = X86::TEST32ri;
+ } else {
+ // No eligible transformation was found.
+ break;
}
- // For example, "testl %eax, $2048" to "testb %ah, $8".
- if (isShiftedUInt<8, 8>(Mask) &&
- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
- // Shift the immediate right by 8 bits.
- SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the h-register.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
- MVT::i8, Reg);
-
- // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
- // target GR8_NOREX registers, so make sure the register class is
- // forced.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
- MVT::i32, Subreg, ShiftedImm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
+ SDValue Reg = N0.getOperand(0);
- // For example, "testl %eax, $32776" to "testw %ax, $32776".
- // NOTE: We only want to form TESTW instructions if optimizing for
- // min size. Otherwise we only save one byte and possibly get a length
- // changing prefix penalty in the decoders.
- if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
- (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the 16-bit subregister.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
- MVT::i16, Reg);
-
- // Emit a testw.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ // Extract the subregister if necessary.
+ if (N0.getValueType() != VT)
+ Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
- // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
- if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
- (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
- SDValue Reg = N0.getOperand(0);
-
- // Extract the 32-bit subregister.
- SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
- MVT::i32, Reg);
-
- // Emit a testl.
- SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
- Subreg, Imm);
- // Replace SUB|CMP with TEST, since SUB has two outputs while TEST has
- // one, do not call ReplaceAllUsesWith.
- ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
- SDValue(NewNode, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ // Emit a testl or testw.
+ SDNode *NewNode = CurDAG->getMachineNode(Op, dl, MVT::i32, Reg, Imm);
+ // Replace CMP with TEST.
+ ReplaceNode(Node, NewNode);
+ return;
}
break;
}
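
The TEST narrowing above is sound because testing against a mask only inspects the bits the mask covers, so the zero-flag result is the same in whatever subregister holds those bits; the sign flag does move, which is why the code also checks hasNoSignedComparisonUses. A tiny standalone check of the i32-to-i8 case (testl $8 versus testb $8 on the low byte):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 7u, 8u, 0xFFFFFFF7u, 0xDEADBEEFu}) {
    bool wide   = (x & 0x8u) == 0;          // what testl $8, %eax tests (ZF)
    bool narrow = ((uint8_t)x & 0x8u) == 0; // what testb $8, %al tests (ZF)
    assert(wide == narrow);                 // same zero-flag outcome
  }
  return 0;
}
```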
+ case X86ISD::PCMPISTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
+ CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::PCMPESTR: {
+ if (!Subtarget->hasSSE42())
+ break;
+
+ // Copy the two implicit register inputs.
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
+ Node->getOperand(1),
+ SDValue()).getValue(1);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
+ Node->getOperand(3), InFlag).getValue(1);
+
+ bool NeedIndex = !SDValue(Node, 0).use_empty();
+ bool NeedMask = !SDValue(Node, 1).use_empty();
+ // We can't fold a load if we are going to make two instructions.
+ bool MayFoldLoad = !NeedIndex || !NeedMask;
+
+ MachineSDNode *CNode;
+ if (NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
+ InFlag);
+ ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
+ }
+ if (NeedIndex || !NeedMask) {
+ unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
+ unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
+ CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
+ ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
+ }
+ // Connect the flag usage to the last instruction created.
+ ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
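
The NeedIndex/NeedMask logic in the two cases above matters when source code consumes both results of the same string compare; with the SSE4.2 intrinsics that looks roughly like the sketch below, in which case two machine instructions are emitted and the load operand is not folded. The helper name and mode flags are only an example, not code from this patch; compile with SSE4.2 enabled.

```cpp
#include <nmmintrin.h> // SSE4.2 string intrinsics

// Hypothetical helper: returns the match index and also reports the mask,
// so both the PCMPISTRI- and PCMPISTRM-style results are live.
int firstMatch(__m128i needle, __m128i haystack, __m128i *maskOut) {
  int idx  = _mm_cmpistri(needle, haystack,
                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY); // index result
  *maskOut = _mm_cmpistrm(needle, haystack,
                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY); // mask result
  return idx;
}
```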
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;