diff options
Diffstat (limited to 'contrib/llvm/lib/Target/X86')
92 files changed, 19171 insertions, 10474 deletions
diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index a365f62190d2..543af8e4e77d 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -10,9 +10,12 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86AsmInstrumentation.h" #include "X86Operand.h" +#include "X86RegisterInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" +#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" @@ -23,6 +26,73 @@ #include "llvm/MC/MCTargetAsmParser.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Support/CommandLine.h" +#include <algorithm> +#include <cassert> +#include <vector> + +// Following comment describes how assembly instrumentation works. +// Currently we have only AddressSanitizer instrumentation, but we're +// planning to implement MemorySanitizer for inline assembly too. If +// you're not familiar with AddressSanitizer algorithm, please, read +// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm. +// +// When inline assembly is parsed by an instance of X86AsmParser, all +// instructions are emitted via EmitInstruction method. That's the +// place where X86AsmInstrumentation analyzes an instruction and +// decides, whether the instruction should be emitted as is or +// instrumentation is required. The latter case happens when an +// instruction reads from or writes to memory. Now instruction opcode +// is explicitly checked, and if an instruction has a memory operand +// (for instance, movq (%rsi, %rcx, 8), %rax) - it should be +// instrumented. There're also exist instructions that modify +// memory but don't have an explicit memory operands, for instance, +// movs. 
+// +// Let's consider at first 8-byte memory accesses when an instruction +// has an explicit memory operand. In this case we need two registers - +// AddressReg to compute address of a memory cells which are accessed +// and ShadowReg to compute corresponding shadow address. So, we need +// to spill both registers before instrumentation code and restore them +// after instrumentation. Thus, in general, instrumentation code will +// look like this: +// PUSHF # Store flags, otherwise they will be overwritten +// PUSH AddressReg # spill AddressReg +// PUSH ShadowReg # spill ShadowReg +// LEA MemOp, AddressReg # compute address of the memory operand +// MOV AddressReg, ShadowReg +// SHR ShadowReg, 3 +// # ShadowOffset(AddressReg >> 3) contains address of a shadow +// # corresponding to MemOp. +// CMP ShadowOffset(ShadowReg), 0 # test shadow value +// JZ .Done # when shadow equals to zero, everything is fine +// MOV AddressReg, RDI +// # Call __asan_report function with AddressReg as an argument +// CALL __asan_report +// .Done: +// POP ShadowReg # Restore ShadowReg +// POP AddressReg # Restore AddressReg +// POPF # Restore flags +// +// Memory accesses with different size (1-, 2-, 4- and 16-byte) are +// handled in a similar manner, but small memory accesses (less than 8 +// byte) require an additional ScratchReg, which is used for shadow value. +// +// If, suppose, we're instrumenting an instruction like movs, only +// contents of RDI, RDI + AccessSize * RCX, RSI, RSI + AccessSize * +// RCX are checked. In this case there're no need to spill and restore +// AddressReg , ShadowReg or flags four times, they're saved on stack +// just once, before instrumentation of these four addresses, and restored +// at the end of the instrumentation. +// +// There exist several things which complicate this simple algorithm. +// * Instrumented memory operand can have RSP as a base or an index +// register. 
So we need to add a constant offset before computation +// of memory address, since flags, AddressReg, ShadowReg, etc. were +// already stored on stack and RSP was modified. +// * Debug info (usually, DWARF) should be adjusted, because sometimes +// RSP is used as a frame register. So, we need to select some +// register as a frame register and temprorary override current CFA +// register. namespace llvm { namespace { @@ -32,10 +102,23 @@ static cl::opt<bool> ClAsanInstrumentAssembly( cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden, cl::init(false)); -bool IsStackReg(unsigned Reg) { - return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP; +const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min(); +const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max(); + +int64_t ApplyDisplacementBounds(int64_t Displacement) { + return std::max(std::min(MaxAllowedDisplacement, Displacement), + MinAllowedDisplacement); +} + +void CheckDisplacementBounds(int64_t Displacement) { + assert(Displacement >= MinAllowedDisplacement && + Displacement <= MaxAllowedDisplacement); } +bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } + +bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } + std::string FuncName(unsigned AccessSize, bool IsWrite) { return std::string("__asan_report_") + (IsWrite ? 
"store" : "load") + utostr(AccessSize); @@ -43,60 +126,264 @@ std::string FuncName(unsigned AccessSize, bool IsWrite) { class X86AddressSanitizer : public X86AsmInstrumentation { public: - X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {} + struct RegisterContext { + private: + enum RegOffset { + REG_OFFSET_ADDRESS = 0, + REG_OFFSET_SHADOW, + REG_OFFSET_SCRATCH + }; + + public: + RegisterContext(unsigned AddressReg, unsigned ShadowReg, + unsigned ScratchReg) { + BusyRegs.push_back(convReg(AddressReg, MVT::i64)); + BusyRegs.push_back(convReg(ShadowReg, MVT::i64)); + BusyRegs.push_back(convReg(ScratchReg, MVT::i64)); + } + + unsigned AddressReg(MVT::SimpleValueType VT) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT); + } + + unsigned ShadowReg(MVT::SimpleValueType VT) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], VT); + } + + unsigned ScratchReg(MVT::SimpleValueType VT) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT); + } + + void AddBusyReg(unsigned Reg) { + if (Reg != X86::NoRegister) + BusyRegs.push_back(convReg(Reg, MVT::i64)); + } + + void AddBusyRegs(const X86Operand &Op) { + AddBusyReg(Op.getMemBaseReg()); + AddBusyReg(Op.getMemIndexReg()); + } + + unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { + static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, + X86::RCX, X86::RDX, X86::RDI, + X86::RSI }; + for (unsigned Reg : Candidates) { + if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) + return convReg(Reg, VT); + } + return X86::NoRegister; + } + + private: + unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const { + return Reg == X86::NoRegister ? 
Reg : getX86SubSuperRegister(Reg, VT); + } + + std::vector<unsigned> BusyRegs; + }; + + X86AddressSanitizer(const MCSubtargetInfo &STI) + : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} + virtual ~X86AddressSanitizer() {} // X86AsmInstrumentation implementation: - virtual void InstrumentInstruction( - const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) override { + virtual void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { + InstrumentMOVS(Inst, Operands, Ctx, MII, Out); + if (RepPrefix) + EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); + InstrumentMOV(Inst, Operands, Ctx, MII, Out); - } - // Should be implemented differently in x86_32 and x86_64 subclasses. - virtual void InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; - virtual void InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) = 0; + RepPrefix = (Inst.getOpcode() == X86::REP_PREFIX); + if (!RepPrefix) + EmitInstruction(Out, Inst); + } - void InstrumentMemOperand(MCParsedAsmOperand &Op, unsigned AccessSize, - bool IsWrite, MCContext &Ctx, MCStreamer &Out); + // Adjusts up stack and saves all registers used in instrumentation. + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + // Restores all registers used in instrumentation and adjusts stack. 
+ virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) = 0; + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, MCStreamer &Out) = 0; + + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) = 0; + + void InstrumentMemOperand(X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, + MCStreamer &Out); + void InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, unsigned CntReg, + unsigned AccessSize, MCContext &Ctx, MCStreamer &Out); + + void InstrumentMOVS(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); void InstrumentMOV(const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); - void EmitInstruction(MCStreamer &Out, const MCInst &Inst) { - Out.EmitInstruction(Inst, STI); - } +protected: void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } -protected: - const MCSubtargetInfo &STI; + void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg, + MCStreamer &Out) { + assert(VT == MVT::i32 || VT == MVT::i64); + MCInst Inst; + Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::CreateReg(getX86SubSuperRegister(Reg, VT))); + Op.addMemOperands(Inst, 5); + EmitInstruction(Out, Inst); + } + + void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT, + unsigned Reg, MCContext &Ctx, MCStreamer &Out); + + // Creates new memory operand with Displacement added to an original + // displacement. Residue will contain a residue which could happen when the + // total displacement exceeds 32-bit limitation. 
+ std::unique_ptr<X86Operand> AddDisplacement(X86Operand &Op, + int64_t Displacement, + MCContext &Ctx, int64_t *Residue); + + bool is64BitMode() const { + return (STI.getFeatureBits() & X86::Mode64Bit) != 0; + } + bool is32BitMode() const { + return (STI.getFeatureBits() & X86::Mode32Bit) != 0; + } + bool is16BitMode() const { + return (STI.getFeatureBits() & X86::Mode16Bit) != 0; + } + + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + + // True when previous instruction was actually REP prefix. + bool RepPrefix; + + // Offset from the original SP register. + int64_t OrigSPOffset; }; void X86AddressSanitizer::InstrumentMemOperand( - MCParsedAsmOperand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { assert(Op.isMem() && "Op should be a memory operand."); assert((AccessSize & (AccessSize - 1)) == 0 && AccessSize <= 16 && "AccessSize should be a power of two, less or equal than 16."); + // FIXME: take into account load/store alignment. + if (IsSmallMemAccess(AccessSize)) + InstrumentMemOperandSmall(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); + else + InstrumentMemOperandLarge(Op, AccessSize, IsWrite, RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVSBase(unsigned DstReg, unsigned SrcReg, + unsigned CntReg, + unsigned AccessSize, + MCContext &Ctx, MCStreamer &Out) { + // FIXME: check whole ranges [DstReg .. DstReg + AccessSize * (CntReg - 1)] + // and [SrcReg .. SrcReg + AccessSize * (CntReg - 1)]. + RegisterContext RegCtx(X86::RDX /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) + ? 
X86::RBX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyReg(DstReg); + RegCtx.AddBusyReg(SrcReg); + RegCtx.AddBusyReg(CntReg); + + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + + // Test (%SrcReg) + { + const MCExpr *Disp = MCConstantExpr::Create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, SrcReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } - X86Operand &MemOp = static_cast<X86Operand &>(Op); - // FIXME: get rid of this limitation. - if (IsStackReg(MemOp.getMemBaseReg()) || IsStackReg(MemOp.getMemIndexReg())) + // Test -1(%SrcReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, SrcReg, CntReg, AccessSize, SMLoc(), + SMLoc())); + InstrumentMemOperand(*Op, AccessSize, false /* IsWrite */, RegCtx, Ctx, + Out); + } + + // Test (%DstReg) + { + const MCExpr *Disp = MCConstantExpr::Create(0, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, DstReg, 0, AccessSize, SMLoc(), SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + // Test -1(%DstReg, %CntReg, AccessSize) + { + const MCExpr *Disp = MCConstantExpr::Create(-1, Ctx); + std::unique_ptr<X86Operand> Op(X86Operand::CreateMem( + getPointerWidth(), 0, Disp, DstReg, CntReg, AccessSize, SMLoc(), + SMLoc())); + InstrumentMemOperand(*Op, AccessSize, true /* IsWrite */, RegCtx, Ctx, Out); + } + + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); +} + +void X86AddressSanitizer::InstrumentMOVS(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, + MCStreamer &Out) { + // Access size in bytes. 
+ unsigned AccessSize = 0; + + switch (Inst.getOpcode()) { + case X86::MOVSB: + AccessSize = 1; + break; + case X86::MOVSW: + AccessSize = 2; + break; + case X86::MOVSL: + AccessSize = 4; + break; + case X86::MOVSQ: + AccessSize = 8; + break; + default: return; + } - // FIXME: take into account load/store alignment. - if (AccessSize < 8) - InstrumentMemOperandSmallImpl(MemOp, AccessSize, IsWrite, Ctx, Out); - else - InstrumentMemOperandLargeImpl(MemOp, AccessSize, IsWrite, Ctx, Out); + InstrumentMOVSImpl(AccessSize, Ctx, Out); } -void X86AddressSanitizer::InstrumentMOV( - const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) { +void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, + OperandVector &Operands, MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) { // Access size in bytes. unsigned AccessSize = 0; @@ -132,41 +419,201 @@ void X86AddressSanitizer::InstrumentMOV( } const bool IsWrite = MII.get(Inst.getOpcode()).mayStore(); + for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) { assert(Operands[Ix]); MCParsedAsmOperand &Op = *Operands[Ix]; - if (Op.isMem()) - InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out); + if (Op.isMem()) { + X86Operand &MemOp = static_cast<X86Operand &>(Op); + RegisterContext RegCtx( + X86::RDI /* AddressReg */, X86::RAX /* ShadowReg */, + IsSmallMemAccess(AccessSize) ? 
X86::RCX + : X86::NoRegister /* ScratchReg */); + RegCtx.AddBusyRegs(MemOp); + InstrumentMemOperandPrologue(RegCtx, Ctx, Out); + InstrumentMemOperand(MemOp, AccessSize, IsWrite, RegCtx, Ctx, Out); + InstrumentMemOperandEpilogue(RegCtx, Ctx, Out); + } + } +} + +void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, + MVT::SimpleValueType VT, + unsigned Reg, MCContext &Ctx, + MCStreamer &Out) { + int64_t Displacement = 0; + if (IsStackReg(Op.getMemBaseReg())) + Displacement -= OrigSPOffset; + if (IsStackReg(Op.getMemIndexReg())) + Displacement -= OrigSPOffset * Op.getMemScale(); + + assert(Displacement >= 0); + + // Emit Op as is. + if (Displacement == 0) { + EmitLEA(Op, VT, Reg, Out); + return; + } + + int64_t Residue; + std::unique_ptr<X86Operand> NewOp = + AddDisplacement(Op, Displacement, Ctx, &Residue); + EmitLEA(*NewOp, VT, Reg, Out); + + while (Residue != 0) { + const MCConstantExpr *Disp = + MCConstantExpr::Create(ApplyDisplacementBounds(Residue), Ctx); + std::unique_ptr<X86Operand> DispOp = + X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), + SMLoc()); + EmitLEA(*DispOp, VT, Reg, Out); + Residue -= Disp->getValue(); } } +std::unique_ptr<X86Operand> +X86AddressSanitizer::AddDisplacement(X86Operand &Op, int64_t Displacement, + MCContext &Ctx, int64_t *Residue) { + assert(Displacement >= 0); + + if (Displacement == 0 || + (Op.getMemDisp() && Op.getMemDisp()->getKind() != MCExpr::Constant)) { + *Residue = Displacement; + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), + Op.getMemDisp(), Op.getMemBaseReg(), + Op.getMemIndexReg(), Op.getMemScale(), + SMLoc(), SMLoc()); + } + + int64_t OrigDisplacement = + static_cast<const MCConstantExpr *>(Op.getMemDisp())->getValue(); + CheckDisplacementBounds(OrigDisplacement); + Displacement += OrigDisplacement; + + int64_t NewDisplacement = ApplyDisplacementBounds(Displacement); + CheckDisplacementBounds(NewDisplacement); + + *Residue = Displacement - NewDisplacement; + 
const MCExpr *Disp = MCConstantExpr::Create(NewDisplacement, Ctx); + return X86Operand::CreateMem(Op.getMemModeSize(), Op.getMemSegReg(), Disp, + Op.getMemBaseReg(), Op.getMemIndexReg(), + Op.getMemScale(), SMLoc(), SMLoc()); +} + class X86AddressSanitizer32 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x20000000; X86AddressSanitizer32(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} + virtual ~X86AddressSanitizer32() {} - virtual void InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, MVT::i32); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(Reg)); + OrigSPOffset -= 4; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(Reg)); + OrigSPOffset += 4; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); + OrigSPOffset -= 4; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF32)); + OrigSPOffset += 4; + } + + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, LocalFrameReg); + if (FrameReg == X86::ESP) { + Out.EmitCFIAdjustCfaOffset(4 /* byte size of the LocalFrameReg */); 
+ Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV32rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + SpillReg(Out, RegCtx.AddressReg(MVT::i32)); + SpillReg(Out, RegCtx.ShadowReg(MVT::i32)); + if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(MVT::i32)); + StoreFlags(Out); + } - private: - void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, - bool IsWrite, unsigned AddressReg) { + virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(MVT::i32)); + RestoreReg(Out, RegCtx.ShadowReg(MVT::i32)); + RestoreReg(Out, RegCtx.AddressReg(MVT::i32)); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::ESP) + Out.EmitCFIAdjustCfaOffset(-4 /* byte size of the LocalFrameReg */); + } + } + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; + +private: + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { EmitInstruction(Out, 
MCInstBuilder(X86::CLD)); EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::ESP) - .addReg(X86::ESP).addImm(-16)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(AddressReg)); + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::ESP) + .addReg(X86::ESP) + .addImm(-16)); + EmitInstruction( + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); - - const std::string& Fn = FuncName(AccessSize, IsWrite); + const std::string &Fn = FuncName(AccessSize, IsWrite); MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); @@ -174,136 +621,140 @@ public: } }; -void X86AddressSanitizer32::InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EDX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); +void X86AddressSanitizer32::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); - { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::EAX)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } + assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) - .addReg(X86::ECX).addImm(3)); + 
ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); { MCInst Inst; Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::CreateReg(X86::CL)); + Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - EmitInstruction(Out, - MCInstBuilder(X86::TEST8rr).addReg(X86::CL).addReg(X86::CL)); + EmitInstruction( + Out, MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::EDX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::EDX) - .addReg(X86::EDX).addImm(7)); + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); case 1: break; case 2: { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::EDX)); - const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::EDX, 0, 1, SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); + 
X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); break; } case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::EDX) - .addReg(X86::EDX).addImm(3)); - break; - default: - assert(false && "Incorrect access size"); + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); break; } EmitInstruction( - Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::ECX).addReg(X86::CL)); - EmitInstruction( - Out, MCInstBuilder(X86::CMP32rr).addReg(X86::EDX).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + ShadowRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); - - EmitInstruction(Out, MCInstBuilder(X86::POPF32)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EDX)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); } -void X86AddressSanitizer32::InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF32)); +void X86AddressSanitizer32::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - { - MCInst 
Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::EAX)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::SHR32ri).addReg(X86::ECX) - .addReg(X86::ECX).addImm(3)); + ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::SHR32ri) + .addReg(ShadowRegI32) + .addReg(ShadowRegI32) + .addImm(3)); { MCInst Inst; switch (AccessSize) { - case 8: - Inst.setOpcode(X86::CMP8mi); - break; - case 16: - Inst.setOpcode(X86::CMP16mi); - break; - default: - assert(false && "Incorrect access size"); - break; + default: llvm_unreachable("Incorrect access size"); + case 8: + Inst.setOpcode(X86::CMP8mi); + break; + case 16: + Inst.setOpcode(X86::CMP16mi); + break; } const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI32, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); } MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite, X86::EAX); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); +} - EmitInstruction(Out, MCInstBuilder(X86::POPF32)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::ECX)); - EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX)); +void X86AddressSanitizer32::InstrumentMOVSImpl(unsigned 
AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when ECX is equals to zero. + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST32rr).addReg(X86::ECX).addReg(X86::ECX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. + InstrumentMOVSBase(X86::EDI /* DstReg */, X86::ESI /* SrcReg */, + X86::ECX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); } class X86AddressSanitizer64 : public X86AddressSanitizer { @@ -312,37 +763,127 @@ public: X86AddressSanitizer64(const MCSubtargetInfo &STI) : X86AddressSanitizer(STI) {} + virtual ~X86AddressSanitizer64() {} - virtual void InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) override; + unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { + unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); + if (FrameReg == X86::NoRegister) + return FrameReg; + return getX86SubSuperRegister(FrameReg, MVT::i64); + } + + void SpillReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(Reg)); + OrigSPOffset -= 8; + } + + void RestoreReg(MCStreamer &Out, unsigned Reg) { + EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(Reg)); + OrigSPOffset += 8; + } + + void StoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); + OrigSPOffset -= 8; + } + + void RestoreFlags(MCStreamer &Out) { + EmitInstruction(Out, MCInstBuilder(X86::POPF64)); + OrigSPOffset += 8; + } + + virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned 
LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + assert(LocalFrameReg != X86::NoRegister); + + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (MRI && FrameReg != X86::NoRegister) { + SpillReg(Out, X86::RBP); + if (FrameReg == X86::RSP) { + Out.EmitCFIAdjustCfaOffset(8 /* byte size of the LocalFrameReg */); + Out.EmitCFIRelOffset( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */), 0); + } + EmitInstruction( + Out, + MCInstBuilder(X86::MOV64rr).addReg(LocalFrameReg).addReg(FrameReg)); + Out.EmitCFIRememberState(); + Out.EmitCFIDefCfaRegister( + MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); + } + + EmitAdjustRSP(Ctx, Out, -128); + SpillReg(Out, RegCtx.ShadowReg(MVT::i64)); + SpillReg(Out, RegCtx.AddressReg(MVT::i64)); + if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(MVT::i64)); + StoreFlags(Out); + } + + virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + assert(LocalFrameReg != X86::NoRegister); + + RestoreFlags(Out); + if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(MVT::i64)); + RestoreReg(Out, RegCtx.AddressReg(MVT::i64)); + RestoreReg(Out, RegCtx.ShadowReg(MVT::i64)); + EmitAdjustRSP(Ctx, Out, 128); + + unsigned FrameReg = GetFrameReg(Ctx, Out); + if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { + RestoreReg(Out, LocalFrameReg); + Out.EmitCFIRestoreState(); + if (FrameReg == X86::RSP) + Out.EmitCFIAdjustCfaOffset(-8 /* byte size of the LocalFrameReg */); + } + } + + virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + 
MCStreamer &Out) override; + virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { - MCInst Inst; - Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RSP)); - const MCExpr *Disp = MCConstantExpr::Create(Offset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, MVT::i64, X86::RSP, Out); + OrigSPOffset += Offset; } - void EmitCallAsanReport(MCContext &Ctx, MCStreamer &Out, unsigned AccessSize, - bool IsWrite) { + void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, + MCStreamer &Out, const RegisterContext &RegCtx) { EmitInstruction(Out, MCInstBuilder(X86::CLD)); EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8).addReg(X86::RSP) - .addReg(X86::RSP).addImm(-16)); + EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + .addReg(X86::RSP) + .addReg(X86::RSP) + .addImm(-16)); - const std::string& Fn = FuncName(AccessSize, IsWrite); + if (RegCtx.AddressReg(MVT::i64) != X86::RDI) { + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( + RegCtx.AddressReg(MVT::i64))); + } + const std::string &Fn = FuncName(AccessSize, IsWrite); MCSymbol *FnSym = Ctx.GetOrCreateSymbol(StringRef(Fn)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::Create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); @@ -350,119 +891,111 @@ private: } }; -void X86AddressSanitizer64::InstrumentMemOperandSmallImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitAdjustRSP(Ctx, Out, -128); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); - EmitInstruction(Out, 
MCInstBuilder(X86::PUSH64r).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); - { - MCInst Inst; - Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RDI)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - EmitInstruction( - Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RAX).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) - .addReg(X86::RAX).addImm(3)); +void X86AddressSanitizer64::InstrumentMemOperandSmall( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); + unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + + assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + + ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); { MCInst Inst; Inst.setOpcode(X86::MOV8rm); - Inst.addOperand(MCOperand::CreateReg(X86::AL)); + Inst.addOperand(MCOperand::CreateReg(ShadowRegI8)); const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - EmitInstruction(Out, - MCInstBuilder(X86::TEST8rr).addReg(X86::AL).addReg(X86::AL)); + EmitInstruction( + Out, 
MCInstBuilder(X86::TEST8rr).addReg(ShadowRegI8).addReg(ShadowRegI8)); MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - EmitInstruction( - Out, MCInstBuilder(X86::MOV32rr).addReg(X86::ECX).addReg(X86::EDI)); - EmitInstruction(Out, MCInstBuilder(X86::AND32ri).addReg(X86::ECX) - .addReg(X86::ECX).addImm(7)); + EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ScratchRegI32).addReg( + AddressRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::AND32ri) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(7)); switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); case 1: break; case 2: { - MCInst Inst; - Inst.setOpcode(X86::LEA32r); - Inst.addOperand(MCOperand::CreateReg(X86::ECX)); - const MCExpr *Disp = MCConstantExpr::Create(1, Ctx); std::unique_ptr<X86Operand> Op( - X86Operand::CreateMem(0, Disp, X86::ECX, 0, 1, SMLoc(), SMLoc())); - Op->addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, + SMLoc(), SMLoc())); + EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); break; } case 4: - EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8).addReg(X86::ECX) - .addReg(X86::ECX).addImm(3)); - break; - default: - assert(false && "Incorrect access size"); + EmitInstruction(Out, MCInstBuilder(X86::ADD32ri8) + .addReg(ScratchRegI32) + .addReg(ScratchRegI32) + .addImm(3)); break; } EmitInstruction( - Out, MCInstBuilder(X86::MOVSX32rr8).addReg(X86::EAX).addReg(X86::AL)); - EmitInstruction( - Out, MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::EAX)); - EmitInstruction(Out, MCInstBuilder(X86::JL_4).addExpr(DoneExpr)); + Out, + MCInstBuilder(X86::MOVSX32rr8).addReg(ShadowRegI32).addReg(ShadowRegI8)); + EmitInstruction(Out, MCInstBuilder(X86::CMP32rr).addReg(ScratchRegI32).addReg( + 
ShadowRegI32)); + EmitInstruction(Out, MCInstBuilder(X86::JL_1).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); - - EmitInstruction(Out, MCInstBuilder(X86::POPF64)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RCX)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); - EmitAdjustRSP(Ctx, Out, 128); } -void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( - X86Operand &Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx, - MCStreamer &Out) { - EmitAdjustRSP(Ctx, Out, -128); - EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); - EmitInstruction(Out, MCInstBuilder(X86::PUSHF64)); +void X86AddressSanitizer64::InstrumentMemOperandLarge( + X86Operand &Op, unsigned AccessSize, bool IsWrite, + const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { + unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); - { - MCInst Inst; - Inst.setOpcode(X86::LEA64r); - Inst.addOperand(MCOperand::CreateReg(X86::RAX)); - Op.addMemOperands(Inst, 5); - EmitInstruction(Out, Inst); - } - EmitInstruction(Out, MCInstBuilder(X86::SHR64ri).addReg(X86::RAX) - .addReg(X86::RAX).addImm(3)); + ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + + EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( + AddressRegI64)); + EmitInstruction(Out, MCInstBuilder(X86::SHR64ri) + .addReg(ShadowRegI64) + .addReg(ShadowRegI64) + .addImm(3)); { MCInst Inst; switch (AccessSize) { + default: llvm_unreachable("Incorrect access size"); case 8: Inst.setOpcode(X86::CMP8mi); break; case 16: Inst.setOpcode(X86::CMP16mi); break; - default: - assert(false && "Incorrect access size"); - break; } const MCExpr *Disp = MCConstantExpr::Create(kShadowOffset, Ctx); std::unique_ptr<X86Operand> 
Op( - X86Operand::CreateMem(0, Disp, X86::RAX, 0, 1, SMLoc(), SMLoc())); + X86Operand::CreateMem(getPointerWidth(), 0, Disp, ShadowRegI64, 0, 1, + SMLoc(), SMLoc())); Op->addMemOperands(Inst, 5); Inst.addOperand(MCOperand::CreateImm(0)); EmitInstruction(Out, Inst); @@ -470,24 +1003,68 @@ void X86AddressSanitizer64::InstrumentMemOperandLargeImpl( MCSymbol *DoneSym = Ctx.CreateTempSymbol(); const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); - EmitInstruction(Out, MCInstBuilder(X86::JE_4).addExpr(DoneExpr)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); - EmitCallAsanReport(Ctx, Out, AccessSize, IsWrite); + EmitCallAsanReport(AccessSize, IsWrite, Ctx, Out, RegCtx); EmitLabel(Out, DoneSym); +} - EmitInstruction(Out, MCInstBuilder(X86::POPF64)); - EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RAX)); - EmitAdjustRSP(Ctx, Out, 128); +void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, + MCContext &Ctx, + MCStreamer &Out) { + StoreFlags(Out); + + // No need to test when RCX is equals to zero. + MCSymbol *DoneSym = Ctx.CreateTempSymbol(); + const MCExpr *DoneExpr = MCSymbolRefExpr::Create(DoneSym, Ctx); + EmitInstruction( + Out, MCInstBuilder(X86::TEST64rr).addReg(X86::RCX).addReg(X86::RCX)); + EmitInstruction(Out, MCInstBuilder(X86::JE_1).addExpr(DoneExpr)); + + // Instrument first and last elements in src and dst range. 
+ InstrumentMOVSBase(X86::RDI /* DstReg */, X86::RSI /* SrcReg */, + X86::RCX /* CntReg */, AccessSize, Ctx, Out); + + EmitLabel(Out, DoneSym); + RestoreFlags(Out); } } // End anonymous namespace -X86AsmInstrumentation::X86AsmInstrumentation() {} +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI) + : STI(STI), InitialFrameReg(0) {} + X86AsmInstrumentation::~X86AsmInstrumentation() {} -void X86AsmInstrumentation::InstrumentInstruction( +void X86AsmInstrumentation::InstrumentAndEmitInstruction( const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) {} + const MCInstrInfo &MII, MCStreamer &Out) { + EmitInstruction(Out, Inst); +} + +void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, + const MCInst &Inst) { + Out.EmitInstruction(Inst, STI); +} + +unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, + MCStreamer &Out) { + if (!Out.getNumFrameInfos()) // No active dwarf frame + return X86::NoRegister; + const MCDwarfFrameInfo &Frame = Out.getDwarfFrameInfos().back(); + if (Frame.End) // Active dwarf frame is closed + return X86::NoRegister; + const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); + if (!MRI) // No register info + return X86::NoRegister; + + if (InitialFrameReg) { + // FrameReg is set explicitly, we're instrumenting a MachineFunction. 
+ return InitialFrameReg; + } + + return MRI->getLLVMRegNum(Frame.CurrentCfaRegister, true /* IsEH */); +} X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, @@ -501,7 +1078,7 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, if ((STI.getFeatureBits() & X86::Mode64Bit) != 0) return new X86AddressSanitizer64(STI); } - return new X86AsmInstrumentation(); + return new X86AsmInstrumentation(STI); } } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 1bc3c0930153..19ebcc44f61e 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_ASM_INSTRUMENTATION_H -#define X86_ASM_INSTRUMENTATION_H +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H #include "llvm/ADT/SmallVector.h" @@ -34,11 +34,15 @@ class X86AsmInstrumentation { public: virtual ~X86AsmInstrumentation(); - // Instruments Inst. Should be called just before the original - // instruction is sent to Out. - virtual void InstrumentInstruction( + // Sets frame register corresponding to a current frame. + void SetInitialFrameRegister(unsigned RegNo) { + InitialFrameReg = RegNo; + } + + // Tries to instrument and emit instruction. 
+ virtual void InstrumentAndEmitInstruction( const MCInst &Inst, - SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, + SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands, MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); protected: @@ -46,9 +50,17 @@ protected: CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, const MCContext &Ctx, const MCSubtargetInfo &STI); - X86AsmInstrumentation(); + X86AsmInstrumentation(const MCSubtargetInfo &STI); + + unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); + + void EmitInstruction(MCStreamer &Out, const MCInst &Inst); + + const MCSubtargetInfo &STI; + + unsigned InitialFrameReg; }; } // End llvm namespace -#endif // X86_ASM_INSTRUMENTATION_H +#endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index a11a238fc976..d743aa6ef333 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -32,6 +32,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> #include <memory> using namespace llvm; @@ -55,12 +56,12 @@ static const char OpPrecedence[] = { class X86AsmParser : public MCTargetAsmParser { MCSubtargetInfo &STI; - MCAsmParser &Parser; const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; private: SMLoc consumeToken() { + MCAsmParser &Parser = getParser(); SMLoc Result = Parser.getTok().getLoc(); Parser.Lex(); return Result; @@ -85,7 +86,7 @@ private: typedef std::pair< InfixCalculatorTok, int64_t > ICToken; SmallVector<InfixCalculatorTok, 4> InfixOperatorStack; SmallVector<ICToken, 4> PostfixStack; - + public: int64_t popOperand() { assert (!PostfixStack.empty() && "Poped an empty stack!"); @@ -99,7 +100,7 @@ private: "Unexpected operand!"); 
PostfixStack.push_back(std::make_pair(Op, Val)); } - + void popOperator() { InfixOperatorStack.pop_back(); } void pushOperator(InfixCalculatorTok Op) { // Push the new operator if the stack is empty. @@ -107,7 +108,7 @@ private: InfixOperatorStack.push_back(Op); return; } - + // Push the new operator if it has a higher precedence than the operator // on the top of the stack or the operator on the top of the stack is a // left parentheses. @@ -117,7 +118,7 @@ private: InfixOperatorStack.push_back(Op); return; } - + // The operator on the top of the stack has higher precedence than the // new operator. unsigned ParenCount = 0; @@ -125,17 +126,17 @@ private: // Nothing to process. if (InfixOperatorStack.empty()) break; - + Idx = InfixOperatorStack.size() - 1; StackOp = InfixOperatorStack[Idx]; if (!(OpPrecedence[StackOp] >= OpPrecedence[Op] || ParenCount)) break; - + // If we have an even parentheses count and we see a left parentheses, // then stop processing. if (!ParenCount && StackOp == IC_LPAREN) break; - + if (StackOp == IC_RPAREN) { ++ParenCount; InfixOperatorStack.pop_back(); @@ -157,10 +158,10 @@ private: if (StackOp != IC_LPAREN && StackOp != IC_RPAREN) PostfixStack.push_back(std::make_pair(StackOp, 0)); } - + if (PostfixStack.empty()) return 0; - + SmallVector<ICToken, 16> OperandStack; for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { ICToken Op = PostfixStack[i]; @@ -262,7 +263,7 @@ private: State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac), AddImmPrefix(addimmprefix) { Info.clear(); } - + unsigned getBaseReg() { return BaseReg; } unsigned getIndexReg() { return IndexReg; } unsigned getScale() { return Scale; } @@ -630,13 +631,10 @@ private: } }; - MCAsmParser &getParser() const { return Parser; } - - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - bool Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None, bool MatchingInlineAsm = false) 
{ + MCAsmParser &Parser = getParser(); if (MatchingInlineAsm) return true; return Parser.Error(L, Msg, Ranges); } @@ -644,8 +642,9 @@ private: bool ErrorAndEatStatement(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges = None, bool MatchingInlineAsm = false) { - Parser.eatToEndOfStatement(); - return Error(L, Msg, Ranges, MatchingInlineAsm); + MCAsmParser &Parser = getParser(); + Parser.eatToEndOfStatement(); + return Error(L, Msg, Ranges, MatchingInlineAsm); } std::nullptr_t ErrorOperand(SMLoc Loc, StringRef Msg) { @@ -685,6 +684,7 @@ private: bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -693,10 +693,26 @@ private: bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, + uint64_t &ErrorInfo, bool MatchingInlineAsm) override; - virtual bool OmitRegisterFromClobberLists(unsigned RegNo) override; + void MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, OperandVector &Operands, + MCStreamer &Out, bool MatchingInlineAsm); + + bool ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm); + + bool OmitRegisterFromClobberLists(unsigned RegNo) override; /// doSrcDstMatch - Returns true if operands are matching in their /// word size (%si and %di, %esi and %edi, etc.). 
Order depends on @@ -730,6 +746,13 @@ private: (X86::Mode64Bit | X86::Mode32Bit | X86::Mode16Bit))); } + unsigned getPointerWidth() { + if (is16BitMode()) return 16; + if (is32BitMode()) return 32; + if (is64BitMode()) return 64; + llvm_unreachable("invalid mode"); + } + bool isParsingIntelSyntax() { return getParser().getAssemblerDialect(); } @@ -743,11 +766,9 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, - const MCInstrInfo &mii, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii), - InstInfo(nullptr) { + X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser, + const MCInstrInfo &mii, const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) { // Initialize the set of available features. setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -757,6 +778,8 @@ public: bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + void SetFrameRegister(unsigned RegNo) override; + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; @@ -832,6 +855,7 @@ bool X86AsmParser::doSrcDstMatch(X86Operand &Op1, X86Operand &Op2) bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmParser &Parser = getParser(); RegNo = 0; const AsmToken &PercentTok = Parser.getTok(); StartLoc = PercentTok.getLoc(); @@ -939,20 +963,26 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } +void X86AsmParser::SetFrameRegister(unsigned RegNo) { + Instrumentation->SetInitialFrameRegister(RegNo); +} + std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RSI : (is32BitMode() ? 
X86::ESI : X86::SI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); - return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg, - /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); } std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) { unsigned basereg = is64BitMode() ? X86::RDI : (is32BitMode() ? X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); - return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/basereg, - /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/basereg, /*IndexReg=*/0, /*Scale=*/1, + Loc, Loc, 0); } std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { @@ -981,15 +1011,20 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, InlineAsmIdentifierInfo &Info) { - // If this is not a VarDecl then assume it is a FuncDecl or some other label - // reference. We need an 'r' constraint here, so we need to create register - // operand to ensure proper matching. Just pick a GPR based on the size of - // a pointer. - if (isa<MCSymbolRefExpr>(Disp) && !Info.IsVarDecl) { - unsigned RegNo = - is64BitMode() ? X86::RBX : (is32BitMode() ? X86::EBX : X86::BX); - return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true, - SMLoc(), Identifier, Info.OpDecl); + // If we found a decl other than a VarDecl, then assume it is a FuncDecl or + // some other label reference. + if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) { + // Insert an explicit size if the user didn't have one. 
+ if (!Size) { + Size = getPointerWidth(); + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, + /*Len=*/0, Size)); + } + + // Create an absolute memory reference in order to match against + // instructions taking a PC relative operand. + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, + Identifier, Info.OpDecl); } // We either have a direct symbol reference, or an offset from a symbol. The @@ -1011,8 +1046,9 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // if we don't know the actual value at this time. This is necessary to // get the matching correct in some cases. BaseReg = BaseReg ? BaseReg : 1; - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, - End, Size, Identifier, Info.OpDecl); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size, Identifier, + Info.OpDecl); } static void @@ -1064,7 +1100,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, (*I).Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); - // Skip everything before the symbol. + // Skip everything before the symbol. if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { assert(Len > 0 && "Expected a non-negative length."); AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); @@ -1078,6 +1114,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, } bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); bool Done = false; @@ -1088,7 +1125,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { // identifier. Don't try an parse it as a register. if (Tok.getString().startswith(".")) break; - + // If we're parsing an immediate expression, we don't expect a '['. 
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; @@ -1154,7 +1191,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCSymbol *Sym = getContext().GetDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; - const MCExpr *Val = + const MCExpr *Val = MCSymbolRefExpr::Create(Sym, Variant, getContext()); if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); @@ -1199,6 +1236,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { std::unique_ptr<X86Operand> X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp, unsigned Size) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); if (getLexer().isNot(AsmToken::LBrac)) @@ -1238,7 +1276,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, const MCExpr *NewDisp; if (ParseIntelDotOperator(Disp, NewDisp)) return nullptr; - + End = Tok.getEndLoc(); Parser.Lex(); // Eat the field. 
Disp = NewDisp; @@ -1251,17 +1289,17 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // handle [-42] if (!BaseReg && !IndexReg) { if (!SegReg) - return X86Operand::CreateMem(Disp, Start, End, Size); - else - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + Start, End, Size); } StringRef ErrMsg; if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { Error(StartInBrac, ErrMsg); return nullptr; } - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start, - End, Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size); } InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); @@ -1274,13 +1312,16 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { + MCAsmParser &Parser = getParser(); assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); - SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + void *Result = + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); const AsmToken &Tok = Parser.getTok(); + SMLoc Loc = Tok.getLoc(); // Advance the token stream until the end of the current token is // after the end of what the frontend claimed. @@ -1292,9 +1333,22 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); if (End.getPointer() == EndPtr) break; } + Identifier = LineBuf; + + // If the identifier lookup was unsuccessful, assume that we are dealing with + // a label. 
+ if (!Result) { + StringRef InternalName = + SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(), + Loc, false); + assert(InternalName.size() && "We should have an internal name here."); + // Push a rewrite for replacing the identifier name with the internal name. + InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc, + Identifier.size(), + InternalName)); + } // Create the symbol reference. - Identifier = LineBuf; MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext()); @@ -1305,6 +1359,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, std::unique_ptr<X86Operand> X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size) { + MCAsmParser &Parser = getParser(); assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); const AsmToken &Tok = Parser.getTok(); // Eat colon. if (Tok.isNot(AsmToken::Colon)) @@ -1325,9 +1380,9 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, // be followed by a bracketed expression. If it isn't we know we have our // final segment override. 
const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext()); - return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0, - /*Scale=*/1, Start, ImmDispToken.getEndLoc(), - Size); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, + /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1, + Start, ImmDispToken.getEndLoc(), Size); } } @@ -1340,7 +1395,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, if (getParser().parsePrimaryExpr(Val, End)) return ErrorOperand(Tok.getLoc(), "unknown token in expression"); - return X86Operand::CreateMem(Val, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); } InlineAsmIdentifierInfo Info; @@ -1356,6 +1411,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start, unsigned Size) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc End; @@ -1369,7 +1425,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, if (getParser().parsePrimaryExpr(Val, End)) return ErrorOperand(Tok.getLoc(), "unknown token in expression"); - return X86Operand::CreateMem(Val, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); } InlineAsmIdentifierInfo Info; @@ -1407,14 +1463,15 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, // BaseReg is non-zero to avoid assertions. In the context of inline asm, // we're pointing to a local variable in memory, so the base register is // really the frame or stack pointer. - return X86Operand::CreateMem(/*SegReg=*/0, Disp, /*BaseReg=*/1, /*IndexReg=*/0, - /*Scale=*/1, Start, End, Size, Identifier, - Info.OpDecl); + return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/1, /*IndexReg=*/0, /*Scale=*/1, + Start, End, Size, Identifier, Info.OpDecl); } /// Parse the '.' 
operator. bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); int64_t OrigDispVal, DotDispVal; @@ -1459,6 +1516,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, /// Parse the 'offset' operator. This operator is used to specify the /// location rather then the content of a variable. std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc OffsetOfLoc = Tok.getLoc(); Parser.Lex(); // Eat offset. @@ -1496,6 +1554,7 @@ enum IntelOperatorKind { /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. @@ -1529,6 +1588,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { } std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { + MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); SMLoc Start, End; @@ -1549,7 +1609,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") - return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!"); + return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. } Start = Tok.getLoc(); @@ -1580,7 +1640,8 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // to the MCExpr with the directional local symbol and this is a // memory operand not an immediate operand. 
if (SM.getSym()) - return X86Operand::CreateMem(SM.getSym(), Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, + Size); const MCExpr *ImmExpr = MCConstantExpr::Create(Imm, getContext()); return X86Operand::CreateImm(ImmExpr, Start, End); @@ -1611,6 +1672,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { } std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { + MCAsmParser &Parser = getParser(); switch (getLexer().getKind()) { default: // Parse a memory operand with no segment register. @@ -1631,6 +1693,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { if (getLexer().isNot(AsmToken::Colon)) return X86Operand::CreateReg(RegNo, Start, End); + if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) + return ErrorOperand(Start, "invalid segment register"); + getParser().Lex(); // Eat the colon. return ParseMemOperand(RegNo, Start); } @@ -1648,6 +1713,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { + MCAsmParser &Parser = getParser(); if(STI.getFeatureBits() & X86::FeatureAVX512) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. @@ -1719,6 +1785,7 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) { + MCAsmParser &Parser = getParser(); // We have to disambiguate a parenthesized expression "(4+5)" from the start // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)". The // only way to do this without lookahead is to eat the '(' and see what is @@ -1733,8 +1800,9 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, if (getLexer().isNot(AsmToken::LParen)) { // Unless we have a segment register, treat this as an immediate. 
if (SegReg == 0) - return X86Operand::CreateMem(Disp, MemStart, ExprEnd); - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); } // Eat the '('. @@ -1760,8 +1828,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, if (getLexer().isNot(AsmToken::LParen)) { // Unless we have a segment register, treat this as an immediate. if (SegReg == 0) - return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd); - return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, LParenLoc, + ExprEnd); + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, + MemStart, ExprEnd); } // Eat the '('. @@ -1876,12 +1946,15 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, return nullptr; } - return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, - MemStart, MemEnd); + if (SegReg || BaseReg || IndexReg) + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, MemStart, MemEnd); + return X86Operand::CreateMem(getPointerWidth(), Disp, MemStart, MemEnd); } bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) { + MCAsmParser &Parser = getParser(); InstInfo = &Info; StringRef PatchedName = Name; @@ -2200,6 +2273,22 @@ static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); } +bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { + switch (Inst.getOpcode()) { + default: return true; + case X86::INT: + X86Operand &Op = static_cast<X86Operand &>(*Ops[1]); + assert(Op.isImm() && "expected immediate"); + int64_t Res; + if (!Op.getImm()->EvaluateAsAbsolute(Res) || Res > 255) { + 
Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]"); + return false; + } + return true; + } + llvm_unreachable("handle the instruction appropriately"); +} + bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return false; @@ -2279,51 +2368,79 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { } } -static const char *getSubtargetFeatureName(unsigned Val); +static const char *getSubtargetFeatureName(uint64_t Val); void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { - Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII, - Out); - Out.EmitInstruction(Inst, STI); + Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(), + MII, Out); } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, - MCStreamer &Out, unsigned &ErrorInfo, + MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); - assert(Op.isToken() && "Leading operand should always be a mnemonic!"); - ArrayRef<SMRange> EmptyRanges = None; + if (isParsingIntelSyntax()) + return MatchAndEmitIntelInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); + return MatchAndEmitATTInstruction(IDLoc, Opcode, Operands, Out, ErrorInfo, + MatchingInlineAsm); +} - // First, handle aliases that expand to multiple instructions. +void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, + OperandVector &Operands, MCStreamer &Out, + bool MatchingInlineAsm) { // FIXME: This should be replaced with a real .td file alias mechanism. // Also, MatchInstructionImpl should actually *do* the EmitInstruction // call. 
- if (Op.getToken() == "fstsw" || Op.getToken() == "fstcw" || - Op.getToken() == "fstsww" || Op.getToken() == "fstcww" || - Op.getToken() == "finit" || Op.getToken() == "fsave" || - Op.getToken() == "fstenv" || Op.getToken() == "fclex") { + const char *Repl = StringSwitch<const char *>(Op.getToken()) + .Case("finit", "fninit") + .Case("fsave", "fnsave") + .Case("fstcw", "fnstcw") + .Case("fstcww", "fnstcw") + .Case("fstenv", "fnstenv") + .Case("fstsw", "fnstsw") + .Case("fstsww", "fnstsw") + .Case("fclex", "fnclex") + .Default(nullptr); + if (Repl) { MCInst Inst; Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); if (!MatchingInlineAsm) EmitInstruction(Inst, Operands, Out); - - const char *Repl = StringSwitch<const char *>(Op.getToken()) - .Case("finit", "fninit") - .Case("fsave", "fnsave") - .Case("fstcw", "fnstcw") - .Case("fstcww", "fnstcw") - .Case("fstenv", "fnstenv") - .Case("fstsw", "fnstsw") - .Case("fstsww", "fnstsw") - .Case("fclex", "fnclex") - .Default(nullptr); - assert(Repl && "Unknown wait-prefixed instruction"); Operands[0] = X86Operand::CreateToken(Repl, IDLoc); } +} + +bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, + bool MatchingInlineAsm) { + assert(ErrorInfo && "Unknown missing feature!"); + ArrayRef<SMRange> EmptyRanges = None; + SmallString<126> Msg; + raw_svector_ostream OS(Msg); + OS << "instruction requires:"; + uint64_t Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) + OS << ' ' << getSubtargetFeatureName(ErrorInfo & Mask); + Mask <<= 1; + } + return Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm); +} + +bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a 
mnemonic!"); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); bool WasOriginallyInvalidOperand = false; MCInst Inst; @@ -2332,8 +2449,11 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())) { - default: break; + default: llvm_unreachable("Unexpected match result!"); case Match_Success: + if (!validateInstruction(Inst, Operands)) + return true; + // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. @@ -2346,21 +2466,8 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, EmitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; - case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); - // Special case the error message for the very common case where only - // a single subtarget feature is missing. - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); - } + case Match_MissingFeature: + return ErrorMissingFeature(IDLoc, ErrorInfo, MatchingInlineAsm); case Match_InvalidOperand: WasOriginallyInvalidOperand = true; break; @@ -2389,34 +2496,18 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; // Check for the various suffix matches. - Tmp[Base.size()] = Suffixes[0]; - unsigned ErrorInfoIgnore; - unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. 
- unsigned Match1, Match2, Match3, Match4; - - Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match1 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; - Tmp[Base.size()] = Suffixes[1]; - Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match2 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; - Tmp[Base.size()] = Suffixes[2]; - Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match3 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; - Tmp[Base.size()] = Suffixes[3]; - Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, - MatchingInlineAsm, isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. - if (Match4 == Match_MissingFeature) - ErrorInfoMissingFeature = ErrorInfoIgnore; + uint64_t ErrorInfoIgnore; + uint64_t ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings. + unsigned Match[4]; + + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { + Tmp.back() = Suffixes[I]; + Match[I] = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match[I] == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } // Restore the old token. Op.setTokenValue(Base); @@ -2425,8 +2516,7 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // instruction will already have been filled in correctly, since the failing // matches won't have modified it). 
unsigned NumSuccessfulMatches = - (Match1 == Match_Success) + (Match2 == Match_Success) + - (Match3 == Match_Success) + (Match4 == Match_Success); + std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { Inst.setLoc(IDLoc); if (!MatchingInlineAsm) @@ -2442,10 +2532,9 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (NumSuccessfulMatches > 1) { char MatchChars[4]; unsigned NumMatches = 0; - if (Match1 == Match_Success) MatchChars[NumMatches++] = Suffixes[0]; - if (Match2 == Match_Success) MatchChars[NumMatches++] = Suffixes[1]; - if (Match3 == Match_Success) MatchChars[NumMatches++] = Suffixes[2]; - if (Match4 == Match_Success) MatchChars[NumMatches++] = Suffixes[3]; + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) + if (Match[I] == Match_Success) + MatchChars[NumMatches++] = Suffixes[I]; SmallString<126> Msg; raw_svector_ostream OS(Msg); @@ -2466,8 +2555,7 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If all of the instructions reported an invalid mnemonic, then the original // mnemonic was invalid. - if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) && - (Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) { + if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { if (!WasOriginallyInvalidOperand) { ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges : Op.getLocRange(); @@ -2476,7 +2564,7 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } // Recover location info for the operand if we know which was the problem. 
- if (ErrorInfo != ~0U) { + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction", EmptyRanges, MatchingInlineAsm); @@ -2495,27 +2583,19 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If one instruction matched with a missing feature, report this as a // missing feature. - if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) + - (Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){ - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfoMissingFeature)*8-1); ++i) { - if (ErrorInfoMissingFeature & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfoMissingFeature & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg, EmptyRanges, MatchingInlineAsm); + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); } // If one instruction matched with an invalid operand, report this as an // operand failure. - if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) + - (Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){ - Error(IDLoc, "invalid operand for instruction", EmptyRanges, - MatchingInlineAsm); - return true; + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); } // If all of these were an outright failure, report it in a useless way. 
@@ -2524,25 +2604,176 @@ bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; } +bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + X86Operand &Op = static_cast<X86Operand &>(*Operands[0]); + assert(Op.isToken() && "Leading operand should always be a mnemonic!"); + StringRef Mnemonic = Op.getToken(); + ArrayRef<SMRange> EmptyRanges = None; + + // First, handle aliases that expand to multiple instructions. + MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); + + MCInst Inst; + + // Find one unsized memory operand, if present. + X86Operand *UnsizedMemOp = nullptr; + for (const auto &Op : Operands) { + X86Operand *X86Op = static_cast<X86Operand *>(Op.get()); + if (X86Op->isMemUnsized()) + UnsizedMemOp = X86Op; + } + + // Allow some instructions to have implicitly pointer-sized operands. This is + // compatible with gas. + if (UnsizedMemOp) { + static const char *const PtrSizedInstrs[] = {"call", "jmp", "push"}; + for (const char *Instr : PtrSizedInstrs) { + if (Mnemonic == Instr) { + UnsizedMemOp->Mem.Size = getPointerWidth(); + break; + } + } + } + + // If an unsized memory operand is present, try to match with each memory + // operand size. In Intel assembly, the size is not part of the instruction + // mnemonic. 
+ SmallVector<unsigned, 8> Match; + uint64_t ErrorInfoMissingFeature = 0; + if (UnsizedMemOp && UnsizedMemOp->isMemUnsized()) { + static const unsigned MopSizes[] = {8, 16, 32, 64, 80, 128, 256, 512}; + for (unsigned Size : MopSizes) { + UnsizedMemOp->Mem.Size = Size; + uint64_t ErrorInfoIgnore; + unsigned LastOpcode = Inst.getOpcode(); + unsigned M = + MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore, + MatchingInlineAsm, isParsingIntelSyntax()); + if (Match.empty() || LastOpcode != Inst.getOpcode()) + Match.push_back(M); + + // If this returned as a missing feature failure, remember that. + if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfoIgnore; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + } + + // If we haven't matched anything yet, this is not a basic integer or FPU + // operation. There shouldn't be any ambiguity in our mnemonic table, so try + // matching with the unsized operand. + if (Match.empty()) { + Match.push_back(MatchInstructionImpl(Operands, Inst, ErrorInfo, + MatchingInlineAsm, + isParsingIntelSyntax())); + // If this returned as a missing feature failure, remember that. + if (Match.back() == Match_MissingFeature) + ErrorInfoMissingFeature = ErrorInfo; + } + + // Restore the size of the unsized memory operand if we modified it. + if (UnsizedMemOp) + UnsizedMemOp->Mem.Size = 0; + + // If it's a bad mnemonic, all results will be the same. + if (Match.back() == Match_MnemonicFail) { + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : Op.getLocRange(); + return Error(IDLoc, "invalid instruction mnemonic '" + Mnemonic + "'", + Ranges, MatchingInlineAsm); + } + + // If exactly one matched, then we treat that as a successful match (and the + // instruction will already have been filled in correctly, since the failing + // matches won't have modified it).
+ unsigned NumSuccessfulMatches = + std::count(std::begin(Match), std::end(Match), Match_Success); + if (NumSuccessfulMatches == 1) { + if (!validateInstruction(Inst, Operands)) + return true; + + // Some instructions need post-processing to, for example, tweak which + // encoding is selected. Loop on it while changes happen so the individual + // transformations can chain off each other. + if (!MatchingInlineAsm) + while (processInstruction(Inst, Operands)) + ; + Inst.setLoc(IDLoc); + if (!MatchingInlineAsm) + EmitInstruction(Inst, Operands, Out); + Opcode = Inst.getOpcode(); + return false; + } else if (NumSuccessfulMatches > 1) { + assert(UnsizedMemOp && + "multiple matches only possible with unsized memory operands"); + ArrayRef<SMRange> Ranges = + MatchingInlineAsm ? EmptyRanges : UnsizedMemOp->getLocRange(); + return Error(UnsizedMemOp->getStartLoc(), + "ambiguous operand size for instruction '" + Mnemonic + "\'", + Ranges, MatchingInlineAsm); + } + + // If one instruction matched with a missing feature, report this as a + // missing feature. + if (std::count(std::begin(Match), std::end(Match), + Match_MissingFeature) == 1) { + ErrorInfo = ErrorInfoMissingFeature; + return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeature, + MatchingInlineAsm); + } + + // If one instruction matched with an invalid operand, report this as an + // operand failure. + if (std::count(std::begin(Match), std::end(Match), + Match_InvalidOperand) == 1) { + return Error(IDLoc, "invalid operand for instruction", EmptyRanges, + MatchingInlineAsm); + } + + // If all of these were an outright failure, report it in a useless way. 
+ return Error(IDLoc, "unknown instruction mnemonic", EmptyRanges, + MatchingInlineAsm); +} + bool X86AsmParser::OmitRegisterFromClobberLists(unsigned RegNo) { return X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo); } bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { + MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getIdentifier(); if (IDVal == ".word") return ParseDirectiveWord(2, DirectiveID.getLoc()); else if (IDVal.startswith(".code")) return ParseDirectiveCode(IDVal, DirectiveID.getLoc()); else if (IDVal.startswith(".att_syntax")) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + if (Parser.getTok().getString() == "prefix") + Parser.Lex(); + else if (Parser.getTok().getString() == "noprefix") + return Error(DirectiveID.getLoc(), "'.att_syntax noprefix' is not " + "supported: registers must have a " + "'%' prefix in .att_syntax"); + } getParser().setAssemblerDialect(0); return false; } else if (IDVal.startswith(".intel_syntax")) { getParser().setAssemblerDialect(1); if (getLexer().isNot(AsmToken::EndOfStatement)) { - // FIXME: Handle noprefix if (Parser.getTok().getString() == "noprefix") Parser.Lex(); + else if (Parser.getTok().getString() == "prefix") + return Error(DirectiveID.getLoc(), "'.intel_syntax prefix' is not " + "supported: registers must not have " + "a '%' prefix in .intel_syntax"); } return false; } @@ -2552,6 +2783,7 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { + MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; @@ -2579,6 +2811,7 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { /// ParseDirectiveCode /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { + MCAsmParser &Parser = getParser(); if (IDVal == ".code16") { 
Parser.Lex(); if (!is16BitMode()) { diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index ef1565f585e9..72aeeaac1639 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_ASM_PARSER_COMMON_H -#define X86_ASM_PARSER_COMMON_H +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMPARSERCOMMON_H namespace llvm { @@ -24,10 +24,6 @@ inline bool isImmSExti32i8Value(uint64_t Value) { (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); } -inline bool isImmZExtu32u8Value(uint64_t Value) { - return (Value <= 0x00000000000000FFULL); -} - inline bool isImmSExti64i8Value(uint64_t Value) { return (( Value <= 0x000000000000007FULL)|| (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); @@ -40,4 +36,4 @@ inline bool isImmSExti64i32Value(uint64_t Value) { } // End of namespace llvm -#endif // X86_ASM_PARSER_COMMON_H +#endif diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h index 1bbfc1192447..6675d4dc045c 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_OPERAND_H -#define X86_OPERAND_H +#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H +#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H #include "X86AsmParserCommon.h" #include "llvm/MC/MCExpr.h" @@ -53,6 +53,7 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; + unsigned ModeSize; }; union { @@ -120,6 +121,10 @@ struct X86Operand : public MCParsedAsmOperand { assert(Kind == 
Memory && "Invalid access!"); return Mem.Scale; } + unsigned getMemModeSize() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.ModeSize; + } bool isToken() const override {return Kind == Token; } @@ -153,20 +158,6 @@ struct X86Operand : public MCParsedAsmOperand { // extension. return isImmSExti32i8Value(CE->getValue()); } - bool isImmZExtu32u8() const { - if (!isImm()) - return false; - - // If this isn't a constant expr, just assume it fits and let relaxation - // handle it. - const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); - if (!CE) - return true; - - // Otherwise, check the value is in a range that makes sense for this - // extension. - return isImmZExtu32u8Value(CE->getValue()); - } bool isImmSExti64i8() const { if (!isImm()) return false; @@ -205,6 +196,9 @@ struct X86Operand : public MCParsedAsmOperand { } bool isMem() const override { return Kind == Memory; } + bool isMemUnsized() const { + return Kind == Memory && Mem.Size == 0; + } bool isMem8() const { return Kind == Memory && (!Mem.Size || Mem.Size == 8); } @@ -260,6 +254,14 @@ struct X86Operand : public MCParsedAsmOperand { !getMemIndexReg() && getMemScale() == 1; } + bool isAbsMem16() const { + return isAbsMem() && Mem.ModeSize == 16; + } + + bool isAbsMem32() const { + return isAbsMem() && Mem.ModeSize != 16; + } + bool isSrcIdx() const { return !getMemIndexReg() && getMemScale() == 1 && (getMemBaseReg() == X86::RSI || getMemBaseReg() == X86::ESI || @@ -299,21 +301,43 @@ struct X86Operand : public MCParsedAsmOperand { return isMem64() && isDstIdx(); } - bool isMemOffs8() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8); + bool isMemOffs() const { + return Kind == Memory && !getMemBaseReg() && !getMemIndexReg() && + getMemScale() == 1; + } + + bool isMemOffs16_8() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs16_16() const { + 
return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs16_32() const { + return isMemOffs() && Mem.ModeSize == 16 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs32_8() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 8); } - bool isMemOffs16() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16); + bool isMemOffs32_16() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 16); } - bool isMemOffs32() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32); + bool isMemOffs32_32() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 32); } - bool isMemOffs64() const { - return Kind == Memory && !getMemBaseReg() && - !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64); + bool isMemOffs32_64() const { + return isMemOffs() && Mem.ModeSize == 32 && (!Mem.Size || Mem.Size == 64); + } + bool isMemOffs64_8() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 8); + } + bool isMemOffs64_16() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 16); + } + bool isMemOffs64_32() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 32); + } + bool isMemOffs64_64() const { + return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64); } bool isReg() const override { return Kind == Register; } @@ -441,8 +465,9 @@ struct X86Operand : public MCParsedAsmOperand { /// Create an absolute memory operand. 
static std::unique_ptr<X86Operand> - CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, unsigned Size = 0, - StringRef SymName = StringRef(), void *OpDecl = nullptr) { + CreateMem(unsigned ModeSize, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, StringRef SymName = StringRef(), + void *OpDecl = nullptr) { auto Res = llvm::make_unique<X86Operand>(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -450,6 +475,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; @@ -458,9 +484,9 @@ struct X86Operand : public MCParsedAsmOperand { /// Create a generalized memory operand. static std::unique_ptr<X86Operand> - CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, - unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, StringRef SymName = StringRef(), + CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp, + unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, + SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), void *OpDecl = nullptr) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. 
@@ -476,6 +502,7 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; + Res->Mem.ModeSize = ModeSize; Res->SymName = SymName; Res->OpDecl = OpDecl; Res->AddressOf = false; @@ -485,4 +512,4 @@ struct X86Operand : public MCParsedAsmOperand { } // End of namespace llvm -#endif // X86_OPERAND +#endif diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index 521bd21b81c6..1c5618288e75 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -23,7 +23,6 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -52,8 +51,8 @@ const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode, #define debug(s) DEBUG(Debug(__FILE__, __LINE__, s)); -namespace llvm { - +namespace llvm { + // Fill-ins to make the compiler happy. These constants are never actually // assigned; they are just filler to make an automatically-generated switch // statement work. @@ -97,16 +96,26 @@ X86GenericDisassembler::X86GenericDisassembler( } } -/// regionReader - a callback function that wraps the readByte method from -/// MemoryObject. +struct Region { + ArrayRef<uint8_t> Bytes; + uint64_t Base; + Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} +}; + +/// A callback function that wraps the readByte method from Region. /// -/// @param arg - The generic callback parameter. In this case, this should -/// be a pointer to a MemoryObject. -/// @param byte - A pointer to the byte to be read. -/// @param address - The address to be read. 
-static int regionReader(const void* arg, uint8_t* byte, uint64_t address) { - const MemoryObject* region = static_cast<const MemoryObject*>(arg); - return region->readByte(address, byte); +/// @param Arg - The generic callback parameter. In this case, this should +/// be a pointer to a Region. +/// @param Byte - A pointer to the byte to be read. +/// @param Address - The address to be read. +static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) { + auto *R = static_cast<const Region *>(Arg); + ArrayRef<uint8_t> Bytes = R->Bytes; + unsigned Index = Address - R->Base; + if (Bytes.size() <= Index) + return -1; + *Byte = Bytes[Index]; + return 0; } /// logger - a callback function that wraps the operator<< method from @@ -118,47 +127,38 @@ static int regionReader(const void* arg, uint8_t* byte, uint64_t address) { static void logger(void* arg, const char* log) { if (!arg) return; - + raw_ostream &vStream = *(static_cast<raw_ostream*>(arg)); vStream << log << "\n"; -} - +} + // // Public interface for the disassembler // -MCDisassembler::DecodeStatus -X86GenericDisassembler::getInstruction(MCInst &instr, - uint64_t &size, - const MemoryObject ®ion, - uint64_t address, - raw_ostream &vStream, - raw_ostream &cStream) const { - CommentStream = &cStream; - - InternalInstruction internalInstr; - - dlog_t loggerFn = logger; - if (&vStream == &nulls()) - loggerFn = nullptr; // Disable logging completely if it's going to nulls(). 
- - int ret = decodeInstruction(&internalInstr, - regionReader, - (const void*)®ion, - loggerFn, - (void*)&vStream, - (const void*)MII.get(), - address, - fMode); - - if (ret) { - size = internalInstr.readerCursor - address; +MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( + MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &VStream, raw_ostream &CStream) const { + CommentStream = &CStream; + + InternalInstruction InternalInstr; + + dlog_t LoggerFn = logger; + if (&VStream == &nulls()) + LoggerFn = nullptr; // Disable logging completely if it's going to nulls(). + + Region R(Bytes, Address); + + int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R, + LoggerFn, (void *)&VStream, + (const void *)MII.get(), Address, fMode); + + if (Ret) { + Size = InternalInstr.readerCursor - Address; return Fail; - } - else { - size = internalInstr.length; - return (!translateInstruction(instr, internalInstr, this)) ? - Success : Fail; + } else { + Size = InternalInstr.length; + return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail; } } @@ -184,7 +184,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) { } /// tryAddingSymbolicOperand - trys to add a symbolic operand in place of the -/// immediate Value in the MCInst. +/// immediate Value in the MCInst. /// /// @param Value - The immediate Value, has had any PC adjustment made by /// the caller. @@ -196,7 +196,7 @@ static void translateRegister(MCInst &mcInst, Reg reg) { /// If the getOpInfo() function was set when setupForSymbolicDisassembly() was /// called then that function is called to get any symbolic information for the /// immediate in the instruction using the Address, Offset and Width. If that -/// returns non-zero then the symbolic information it returns is used to create +/// returns non-zero then the symbolic information it returns is used to create /// an MCExpr and that is added as an operand to the MCInst. 
If getOpInfo() /// returns zero and isBranch is true then a symbol look up for immediate Value /// is done and if a symbol is found an MCExpr is created with that, else @@ -204,8 +204,8 @@ static void translateRegister(MCInst &mcInst, Reg reg) { /// if it adds an operand to the MCInst and false otherwise. static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, uint64_t Address, uint64_t Offset, - uint64_t Width, MCInst &MI, - const MCDisassembler *Dis) { + uint64_t Width, MCInst &MI, + const MCDisassembler *Dis) { return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch, Offset, Width); } @@ -215,7 +215,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch, /// These can often be addresses in a literal pool. The Address of the /// instruction and its immediate Value are used to determine the address /// being referenced in the literal pool entry. The SymbolLookUp call back will -/// return a pointer to a literal 'C' string if the referenced address is an +/// return a pointer to a literal 'C' string if the referenced address is an /// address into a section with 'C' string literals. static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value, const void *Decoder) { @@ -287,7 +287,7 @@ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) { static void translateImmediate(MCInst &mcInst, uint64_t immediate, const OperandSpecifier &operand, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { // Sign-extend the immediate if necessary. OperandType type = (OperandType)operand.type; @@ -350,6 +350,54 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case ENCODING_IO: break; } + } else if (type == TYPE_IMM3) { + // Check for immediates that printSSECC can't handle. 
+ if (immediate >= 8) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::CMPPDrmi: NewOpc = X86::CMPPDrmi_alt; break; + case X86::CMPPDrri: NewOpc = X86::CMPPDrri_alt; break; + case X86::CMPPSrmi: NewOpc = X86::CMPPSrmi_alt; break; + case X86::CMPPSrri: NewOpc = X86::CMPPSrri_alt; break; + case X86::CMPSDrm: NewOpc = X86::CMPSDrm_alt; break; + case X86::CMPSDrr: NewOpc = X86::CMPSDrr_alt; break; + case X86::CMPSSrm: NewOpc = X86::CMPSSrm_alt; break; + case X86::CMPSSrr: NewOpc = X86::CMPSSrr_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } + } else if (type == TYPE_IMM5) { + // Check for immediates that printAVXCC can't handle. + if (immediate >= 32) { + unsigned NewOpc; + switch (mcInst.getOpcode()) { + default: llvm_unreachable("unexpected opcode"); + case X86::VCMPPDrmi: NewOpc = X86::VCMPPDrmi_alt; break; + case X86::VCMPPDrri: NewOpc = X86::VCMPPDrri_alt; break; + case X86::VCMPPSrmi: NewOpc = X86::VCMPPSrmi_alt; break; + case X86::VCMPPSrri: NewOpc = X86::VCMPPSrri_alt; break; + case X86::VCMPSDrm: NewOpc = X86::VCMPSDrm_alt; break; + case X86::VCMPSDrr: NewOpc = X86::VCMPSDrr_alt; break; + case X86::VCMPSSrm: NewOpc = X86::VCMPSSrm_alt; break; + case X86::VCMPSSrr: NewOpc = X86::VCMPSSrr_alt; break; + case X86::VCMPPDYrmi: NewOpc = X86::VCMPPDYrmi_alt; break; + case X86::VCMPPDYrri: NewOpc = X86::VCMPPDYrri_alt; break; + case X86::VCMPPSYrmi: NewOpc = X86::VCMPPSYrmi_alt; break; + case X86::VCMPPSYrri: NewOpc = X86::VCMPPSYrri_alt; break; + case X86::VCMPPDZrmi: NewOpc = X86::VCMPPDZrmi_alt; break; + case X86::VCMPPDZrri: NewOpc = X86::VCMPPDZrri_alt; break; + case X86::VCMPPSZrmi: NewOpc = X86::VCMPPSZrmi_alt; break; + case X86::VCMPPSZrri: NewOpc = X86::VCMPPSZrri_alt; break; + case X86::VCMPSDZrm: NewOpc = X86::VCMPSDZrmi_alt; break; + case X86::VCMPSDZrr: NewOpc = X86::VCMPSDZrri_alt; break; + case X86::VCMPSSZrm: NewOpc = 
X86::VCMPSSZrmi_alt; break; + case X86::VCMPSSZrr: NewOpc = X86::VCMPSSZrri_alt; break; + } + // Switch opcode to the one that doesn't get special printing. + mcInst.setOpcode(NewOpc); + } } switch (type) { @@ -407,7 +455,7 @@ static bool translateRMRegister(MCInst &mcInst, debug("A R/M register operand may not have a SIB byte"); return true; } - + switch (insn.eaBase) { default: debug("Unexpected EA base register"); @@ -427,7 +475,7 @@ static bool translateRMRegister(MCInst &mcInst, ALL_REGS #undef ENTRY } - + return false; } @@ -440,26 +488,26 @@ static bool translateRMRegister(MCInst &mcInst, /// from. /// @return - 0 on success; nonzero otherwise static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { // Addresses in an MCInst are represented as five operands: - // 1. basereg (register) The R/M base, or (if there is a SIB) the + // 1. basereg (register) The R/M base, or (if there is a SIB) the // SIB base - // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified + // 2. scaleamount (immediate) 1, or (if there is a SIB) the specified // scale amount // 3. indexreg (register) x86_registerNONE, or (if there is a SIB) - // the index (which is multiplied by the + // the index (which is multiplied by the // scale amount) // 4. displacement (immediate) 0, or the displacement if there is one // 5. segmentreg (register) x86_registerNONE for now, but could be set // if we have segment overrides - + MCOperand baseReg; MCOperand scaleAmount; MCOperand indexReg; MCOperand displacement; MCOperand segmentReg; uint64_t pcrel = 0; - + if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) { if (insn.sibBase != SIB_BASE_NONE) { switch (insn.sibBase) { @@ -512,7 +560,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 : IndexIs256 ? 
SIB_INDEX_YMM0 : SIB_INDEX_XMM0; - insn.sibIndex = (SIBIndex)(IndexBase + + insn.sibIndex = (SIBIndex)(IndexBase + (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); } @@ -534,7 +582,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } else { indexReg = MCOperand::CreateReg(0); } - + scaleAmount = MCOperand::CreateImm(insn.sibScale); } else { switch (insn.eaBase) { @@ -553,7 +601,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, } else baseReg = MCOperand::CreateReg(0); - + indexReg = MCOperand::CreateReg(0); break; case EA_BASE_BX_SI: @@ -584,7 +632,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, // placeholders to keep the compiler happy. #define ENTRY(x) \ case EA_BASE_##x: \ - baseReg = MCOperand::CreateReg(X86::x); break; + baseReg = MCOperand::CreateReg(X86::x); break; ALL_EA_BASES #undef ENTRY #define ENTRY(x) case EA_REG_##x: @@ -595,14 +643,14 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, return true; } } - + scaleAmount = MCOperand::CreateImm(1); } - + displacement = MCOperand::CreateImm(insn.displacement); segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]); - + mcInst.addOperand(baseReg); mcInst.addOperand(scaleAmount); mcInst.addOperand(indexReg); @@ -623,7 +671,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, /// from. 
/// @return - 0 on success; nonzero otherwise static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, - InternalInstruction &insn, const MCDisassembler *Dis) { + InternalInstruction &insn, const MCDisassembler *Dis) { switch (operand.type) { default: debug("Unexpected type for a R/M operand"); @@ -633,8 +681,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_R32: case TYPE_R64: case TYPE_Rv: - case TYPE_MM: - case TYPE_MM32: case TYPE_MM64: case TYPE_XMM: case TYPE_XMM32: @@ -660,9 +706,6 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_M32FP: case TYPE_M64FP: case TYPE_M80FP: - case TYPE_M16INT: - case TYPE_M32INT: - case TYPE_M64INT: case TYPE_M1616: case TYPE_M1632: case TYPE_M1664: @@ -670,7 +713,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, return translateRMMemory(mcInst, insn, Dis); } } - + /// translateFPRegister - Translates a stack position on the FPU stack to its /// LLVM form, and appends it to an MCInst. /// @@ -698,7 +741,7 @@ static bool translateMaskRegister(MCInst &mcInst, return false; } -/// translateOperand - Translates an operand stored in an internal instruction +/// translateOperand - Translates an operand stored in an internal instruction /// to LLVM's format and appends it to an MCInst. /// /// @param mcInst - The MCInst to append to. @@ -707,7 +750,7 @@ static bool translateMaskRegister(MCInst &mcInst, /// @return - false on success; true otherwise. static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { switch (operand.encoding) { default: debug("Unhandled operand encoding during translation"); @@ -761,7 +804,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, insn, Dis); } } - + /// translateInstruction - Translates an internal instruction and all its /// operands to an MCInst. 
/// @@ -770,12 +813,12 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, /// @return - false on success; true otherwise. static bool translateInstruction(MCInst &mcInst, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis) { if (!insn.spec) { debug("Instruction has no specification"); return true; } - + mcInst.setOpcode(insn.instructionID); // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 // prefix bytes should be disassembled as xrelease and xacquire then set the @@ -786,9 +829,9 @@ static bool translateInstruction(MCInst &mcInst, else if(mcInst.getOpcode() == X86::REPNE_PREFIX) mcInst.setOpcode(X86::XACQUIRE_PREFIX); } - + insn.numImmediatesTranslated = 0; - + for (const auto &Op : insn.operands) { if (Op.encoding != ENCODING_NONE) { if (translateOperand(mcInst, Op, insn, Dis)) { @@ -796,7 +839,7 @@ static bool translateInstruction(MCInst &mcInst, } } } - + return false; } @@ -807,9 +850,9 @@ static MCDisassembler *createX86Disassembler(const Target &T, return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII)); } -extern "C" void LLVMInitializeX86Disassembler() { +extern "C" void LLVMInitializeX86Disassembler() { // Register the disassembler. 
- TargetRegistry::RegisterMCDisassembler(TheX86_32Target, + TargetRegistry::RegisterMCDisassembler(TheX86_32Target, createX86Disassembler); TargetRegistry::RegisterMCDisassembler(TheX86_64Target, createX86Disassembler); diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h index 4dc7c29078fc..d7f426b2641d 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.h @@ -71,8 +71,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86DISASSEMBLER_H -#define X86DISASSEMBLER_H +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLER_H #include "X86DisassemblerDecoderCommon.h" #include "llvm/MC/MCDisassembler.h" @@ -87,21 +87,17 @@ class raw_ostream; namespace X86Disassembler { -/// X86GenericDisassembler - Generic disassembler for all X86 platforms. -/// All each platform class should have to do is subclass the constructor, and -/// provide a different disassemblerMode value. +/// Generic disassembler for all X86 platforms. All each platform class should +/// have to do is subclass the constructor, and provide a different +/// disassemblerMode value. class X86GenericDisassembler : public MCDisassembler { std::unique_ptr<const MCInstrInfo> MII; public: - /// Constructor - Initializes the disassembler. - /// X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, std::unique_ptr<const MCInstrInfo> MII); public: - - /// getInstruction - See MCDisassembler. 
DecodeStatus getInstruction(MCInst &instr, uint64_t &size, - const MemoryObject ®ion, uint64_t address, + ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &vStream, raw_ostream &cStream) const override; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index ab3d1f774bc7..619a0d4dd65e 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1,4 +1,4 @@ -//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===// +//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===// // // The LLVM Compiler Infrastructure // @@ -13,10 +13,10 @@ // //===----------------------------------------------------------------------===// -#include <stdarg.h> /* for va_*() */ -#include <stdio.h> /* for vsnprintf() */ -#include <stdlib.h> /* for exit() */ -#include <string.h> /* for memset() */ +#include <cstdarg> /* for va_*() */ +#include <cstdio> /* for vsnprintf() */ +#include <cstdlib> /* for exit() */ +#include <cstring> /* for memset() */ #include "X86DisassemblerDecoder.h" @@ -472,8 +472,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { insn->vectorExtensionType = TYPE_EVEX; - } - else { + } else { unconsumeByte(insn); /* unconsume byte1 */ unconsumeByte(insn); /* unconsume byte */ insn->necessaryPrefixLocation = insn->readerCursor - 2; @@ -504,8 +503,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]); } - } - else if (byte == 0xc4) { + } else if (byte == 0xc4) { uint8_t byte1; if (lookAtByte(insn, &byte1)) { @@ -516,8 +514,7 @@ static int readPrefixes(struct 
InternalInstruction* insn) { if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { insn->vectorExtensionType = TYPE_VEX_3B; insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - else { + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } @@ -541,8 +538,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2]); } - } - else if (byte == 0xc5) { + } else if (byte == 0xc5) { uint8_t byte1; if (lookAtByte(insn, &byte1)) { @@ -552,8 +548,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { insn->vectorExtensionType = TYPE_VEX_2B; - } - else { + } else { unconsumeByte(insn); } @@ -566,8 +561,7 @@ static int readPrefixes(struct InternalInstruction* insn) { | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); } - switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) - { + switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { default: break; case VEX_PREFIX_66: @@ -579,8 +573,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1]); } - } - else if (byte == 0x8f) { + } else if (byte == 0x8f) { uint8_t byte1; if (lookAtByte(insn, &byte1)) { @@ -591,8 +584,7 @@ static int readPrefixes(struct InternalInstruction* insn) { if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. 
*/ insn->vectorExtensionType = TYPE_XOP; insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - else { + } else { unconsumeByte(insn); insn->necessaryPrefixLocation = insn->readerCursor - 1; } @@ -612,8 +604,7 @@ static int readPrefixes(struct InternalInstruction* insn) { | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); } - switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) - { + switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { default: break; case VEX_PREFIX_66: @@ -625,8 +616,7 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2]); } - } - else { + } else { if (insn->mode == MODE_64BIT) { if ((byte & 0xf0) == 0x40) { uint8_t opcodeByte; @@ -698,8 +688,7 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = ONEBYTE; - if (insn->vectorExtensionType == TYPE_EVEX) - { + if (insn->vectorExtensionType == TYPE_EVEX) { switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { default: dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)", @@ -715,8 +704,7 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = THREEBYTE_3A; return consumeByte(insn, &insn->opcode); } - } - else if (insn->vectorExtensionType == TYPE_VEX_3B) { + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { default: dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", @@ -732,12 +720,10 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = THREEBYTE_3A; return consumeByte(insn, &insn->opcode); } - } - else if (insn->vectorExtensionType == TYPE_VEX_2B) { + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { insn->opcodeType = TWOBYTE; return consumeByte(insn, &insn->opcode); - } - else if (insn->vectorExtensionType == TYPE_XOP) { + } else if (insn->vectorExtensionType == TYPE_XOP) { switch 
(mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) { default: dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", @@ -866,6 +852,22 @@ static bool is16BitEquivalent(const char* orig, const char* equiv) { } /* + * is64Bit - Determines whether this instruction is a 64-bit instruction. + * + * @param name - The instruction that is not 16-bit + */ +static bool is64Bit(const char* name) { + off_t i; + + for (i = 0;; ++i) { + if (name[i] == '\0') + return false; + if (name[i] == '6' && name[i+1] == '4') + return true; + } +} + +/* * getID - Determines the ID of an instruction, consuming the ModR/M byte as * appropriate for extended and escape opcodes. Determines the attributes and * context for the instruction before doing so. @@ -911,8 +913,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { attrMask |= ATTR_EVEXL; if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3])) attrMask |= ATTR_EVEXL2; - } - else if (insn->vectorExtensionType == TYPE_VEX_3B) { + } else if (insn->vectorExtensionType == TYPE_VEX_3B) { switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; @@ -927,8 +928,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (lFromVEX3of3(insn->vectorExtensionPrefix[2])) attrMask |= ATTR_VEXL; - } - else if (insn->vectorExtensionType == TYPE_VEX_2B) { + } else if (insn->vectorExtensionType == TYPE_VEX_2B) { switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; @@ -943,8 +943,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (lFromVEX2of2(insn->vectorExtensionPrefix[1])) attrMask |= ATTR_VEXL; - } - else if (insn->vectorExtensionType == TYPE_XOP) { + } else if (insn->vectorExtensionType == TYPE_XOP) { switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { case VEX_PREFIX_66: attrMask |= ATTR_OPSIZE; @@ -959,12 +958,10 @@ static int getID(struct InternalInstruction* insn, const 
void *miiArg) { if (lFromXOP3of3(insn->vectorExtensionPrefix[2])) attrMask |= ATTR_VEXL; - } - else { + } else { return -1; } - } - else { + } else { if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) attrMask |= ATTR_OPSIZE; else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) @@ -978,29 +975,75 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { if (insn->rexPrefix & 0x08) attrMask |= ATTR_REXW; - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; - /* * JCXZ/JECXZ need special handling for 16-bit mode because the meaning * of the AdSize prefix is inverted w.r.t. 32-bit mode. */ - if (insn->mode == MODE_16BIT && insn->opcode == 0xE3) { - const struct InstructionSpecifier *spec; - spec = specifierForUID(instructionID); + if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE && + insn->opcode == 0xE3) + attrMask ^= ATTR_ADSIZE; + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + /* The following clauses compensate for limitations of the tables. */ + + if (insn->mode != MODE_64BIT && + insn->vectorExtensionType != TYPE_NO_VEX_XOP) { /* - * Check for Ii8PCRel instructions. We could alternatively do a - * string-compare on the names, but this is probably cheaper. + * The tables can't distinquish between cases where the W-bit is used to + * select register size and cases where its a required part of the opcode. 
*/ - if (x86OperandSets[spec->operands][0].type == TYPE_REL8) { - attrMask ^= ATTR_ADSIZE; - if (getIDWithAttrMask(&instructionID, insn, attrMask)) - return -1; + if ((insn->vectorExtensionType == TYPE_EVEX && + wFromEVEX3of4(insn->vectorExtensionPrefix[2])) || + (insn->vectorExtensionType == TYPE_VEX_3B && + wFromVEX3of3(insn->vectorExtensionPrefix[2])) || + (insn->vectorExtensionType == TYPE_XOP && + wFromXOP3of3(insn->vectorExtensionPrefix[2]))) { + + uint16_t instructionIDWithREXW; + if (getIDWithAttrMask(&instructionIDWithREXW, + insn, attrMask | ATTR_REXW)) { + insn->instructionID = instructionID; + insn->spec = specifierForUID(instructionID); + return 0; + } + + const char *SpecName = GetInstrName(instructionIDWithREXW, miiArg); + // If not a 64-bit instruction. Switch the opcode. + if (!is64Bit(SpecName)) { + insn->instructionID = instructionIDWithREXW; + insn->spec = specifierForUID(instructionIDWithREXW); + return 0; + } } } - /* The following clauses compensate for limitations of the tables. */ + /* + * Absolute moves need special handling. + * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are + * inverted w.r.t. + * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in + * any position. + */ + if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) { + /* Make sure we observed the prefixes in any position. */ + if (insn->prefixPresent[0x67]) + attrMask |= ATTR_ADSIZE; + if (insn->prefixPresent[0x66]) + attrMask |= ATTR_OPSIZE; + + /* In 16-bit, invert the attributes. 
*/ + if (insn->mode == MODE_16BIT) + attrMask ^= ATTR_ADSIZE | ATTR_OPSIZE; + + if (getIDWithAttrMask(&instructionID, insn, attrMask)) + return -1; + + insn->instructionID = instructionID; + insn->spec = specifierForUID(instructionID); + return 0; + } if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) && !(attrMask & ATTR_OPSIZE)) { @@ -1417,22 +1460,14 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_VK16: \ return prefix##_K0 + index; \ case TYPE_MM64: \ - case TYPE_MM32: \ - case TYPE_MM: \ - if (index > 7) \ - *valid = 0; \ - return prefix##_MM0 + index; \ + return prefix##_MM0 + (index & 0x7); \ case TYPE_SEGMENTREG: \ if (index > 5) \ *valid = 0; \ return prefix##_ES + index; \ case TYPE_DEBUGREG: \ - if (index > 7) \ - *valid = 0; \ return prefix##_DR0 + index; \ case TYPE_CONTROLREG: \ - if (index > 8) \ - *valid = 0; \ return prefix##_CR0 + index; \ } \ } @@ -1709,12 +1744,6 @@ static int readOperands(struct InternalInstruction* insn) { } if (readImmediate(insn, 1)) return -1; - if (Op.type == TYPE_IMM3 && - insn->immediates[insn->numImmediatesConsumed - 1] > 7) - return -1; - if (Op.type == TYPE_IMM5 && - insn->immediates[insn->numImmediatesConsumed - 1] > 31) - return -1; if (Op.type == TYPE_XMM128 || Op.type == TYPE_XMM256) sawRegImm = 1; diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 8c45402ab5e1..a79a923ac525 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86DISASSEMBLERDECODER_H -#define X86DISASSEMBLERDECODER_H +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODER_H #include "X86DisassemblerDecoderCommon.h" #include 
"llvm/ADT/ArrayRef.h" @@ -341,7 +341,15 @@ namespace X86Disassembler { ENTRY(DR4) \ ENTRY(DR5) \ ENTRY(DR6) \ - ENTRY(DR7) + ENTRY(DR7) \ + ENTRY(DR8) \ + ENTRY(DR9) \ + ENTRY(DR10) \ + ENTRY(DR11) \ + ENTRY(DR12) \ + ENTRY(DR13) \ + ENTRY(DR14) \ + ENTRY(DR15) #define REGS_CONTROL \ ENTRY(CR0) \ @@ -352,7 +360,14 @@ namespace X86Disassembler { ENTRY(CR5) \ ENTRY(CR6) \ ENTRY(CR7) \ - ENTRY(CR8) + ENTRY(CR8) \ + ENTRY(CR9) \ + ENTRY(CR10) \ + ENTRY(CR11) \ + ENTRY(CR12) \ + ENTRY(CR13) \ + ENTRY(CR14) \ + ENTRY(CR15) #define ALL_EA_BASES \ EA_BASES_16BIT \ diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index 13a7b557b440..1f8f9da5250e 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86DISASSEMBLERDECODERCOMMON_H -#define X86DISASSEMBLERDECODERCOMMON_H +#ifndef LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H +#define LLVM_LIB_TARGET_X86_DISASSEMBLER_X86DISASSEMBLERDECODERCOMMON_H #include "llvm/Support/DataTypes.h" @@ -82,6 +82,7 @@ enum attributeBits { "operands change width") \ ENUM_ENTRY(IC_ADSIZE, 3, "requires an ADSIZE prefix, so " \ "operands change width") \ + ENUM_ENTRY(IC_OPSIZE_ADSIZE, 4, "requires ADSIZE and OPSIZE prefixes") \ ENUM_ENTRY(IC_XD, 2, "may say something about the opcode " \ "but not the operands") \ ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \ @@ -90,20 +91,24 @@ enum attributeBits { "operands change width") \ ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \ "operands change width") \ - ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\ + ENUM_ENTRY(IC_64BIT_REXW, 5, "requires a REX.W prefix, so operands "\ "change width; overrides IC_OPSIZE") \ 
+ ENUM_ENTRY(IC_64BIT_REXW_ADSIZE, 6, "requires a REX.W prefix and 0x67 " \ + "prefix") \ ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \ ENUM_ENTRY(IC_64BIT_ADSIZE, 3, "Just as meaningful as IC_ADSIZE") \ - ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \ + ENUM_ENTRY(IC_64BIT_OPSIZE_ADSIZE, 4, "Just as meaningful as IC_OPSIZE/" \ + "IC_ADSIZE") \ + ENUM_ENTRY(IC_64BIT_XD, 6, "XD instructions are SSE; REX.W is " \ "secondary") \ - ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \ + ENUM_ENTRY(IC_64BIT_XS, 6, "Just as meaningful as IC_64BIT_XD") \ ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \ ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \ - ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \ + ENUM_ENTRY(IC_64BIT_REXW_XS, 7, "OPSIZE could mean a different " \ "opcode") \ - ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \ + ENUM_ENTRY(IC_64BIT_REXW_XD, 7, "Just as meaningful as " \ "IC_64BIT_REXW_XS") \ - ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 7, "The Dynamic Duo! Prefer over all " \ + ENUM_ENTRY(IC_64BIT_REXW_OPSIZE, 8, "The Dynamic Duo! 
Prefer over all " \ "else because this changes most " \ "operands' meaning") \ ENUM_ENTRY(IC_VEX, 1, "requires a VEX prefix") \ @@ -416,10 +421,6 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_M1616, "2+2-byte segment+offset address") \ ENUM_ENTRY(TYPE_M1632, "2+4-byte") \ ENUM_ENTRY(TYPE_M1664, "2+8-byte") \ - ENUM_ENTRY(TYPE_M16_32, "2+4-byte two-part memory operand (LIDT, LGDT)") \ - ENUM_ENTRY(TYPE_M16_16, "2+2-byte (BOUND)") \ - ENUM_ENTRY(TYPE_M32_32, "4+4-byte (BOUND)") \ - ENUM_ENTRY(TYPE_M16_64, "2+8-byte (LIDT, LGDT)") \ ENUM_ENTRY(TYPE_SRCIDX8, "1-byte memory at source index") \ ENUM_ENTRY(TYPE_SRCIDX16, "2-byte memory at source index") \ ENUM_ENTRY(TYPE_SRCIDX32, "4-byte memory at source index") \ @@ -438,14 +439,8 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_M32FP, "32-bit IEE754 memory floating-point operand") \ ENUM_ENTRY(TYPE_M64FP, "64-bit") \ ENUM_ENTRY(TYPE_M80FP, "80-bit extended") \ - ENUM_ENTRY(TYPE_M16INT, "2-byte memory integer operand for use in " \ - "floating-point instructions") \ - ENUM_ENTRY(TYPE_M32INT, "4-byte") \ - ENUM_ENTRY(TYPE_M64INT, "8-byte") \ ENUM_ENTRY(TYPE_ST, "Position on the floating-point stack") \ - ENUM_ENTRY(TYPE_MM, "MMX register operand") \ - ENUM_ENTRY(TYPE_MM32, "4-byte MMX register or memory operand") \ - ENUM_ENTRY(TYPE_MM64, "8-byte") \ + ENUM_ENTRY(TYPE_MM64, "8-byte MMX register") \ ENUM_ENTRY(TYPE_XMM, "XMM register operand") \ ENUM_ENTRY(TYPE_XMM32, "4-byte XMM register or memory operand") \ ENUM_ENTRY(TYPE_XMM64, "8-byte") \ @@ -500,7 +495,7 @@ enum ModifierType { }; #undef ENUM_ENTRY -static const unsigned X86_MAX_OPERANDS = 5; +static const unsigned X86_MAX_OPERANDS = 6; /// Decoding mode for the Intel disassembler. 
16-bit, 32-bit, and 64-bit mode /// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode, diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index b45b1185bd67..db6948518c5a 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -45,50 +45,36 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, const MCInstrDesc &Desc = MII.get(MI->getOpcode()); uint64_t TSFlags = Desc.TSFlags; + // If verbose assembly is enabled, we can print some informative comments. + if (CommentStream) + HasCustomInstComment = + EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + if (TSFlags & X86II::LOCK) OS << "\tlock\n"; + // Output CALLpcrel32 as "callq" in 64-bit mode. + // In Intel annotation it's always emitted as "call". + // + // TODO: Probably this hack should be redesigned via InstAlias in + // InstrInfo.td as soon as Requires clause is supported properly + // for InstAlias. + if (MI->getOpcode() == X86::CALLpcrel32 && + (getAvailableFeatures() & X86::Mode64Bit) != 0) { + OS << "\tcallq\t"; + printPCRelImm(MI, 0, OS); + } // Try to print any aliases first. - if (!printAliasInstr(MI, OS)) + else if (!printAliasInstr(MI, OS)) printInstruction(MI, OS); // Next always print the annotation. printAnnotation(OS, Annot); - - // If verbose assembly is enabled, we can print some informative comments. 
- if (CommentStream) - EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); -} - -void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0xf; - switch (Imm) { - default: llvm_unreachable("Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; - case 8: O << "eq_uq"; break; - case 9: O << "nge"; break; - case 0xa: O << "ngt"; break; - case 0xb: O << "false"; break; - case 0xc: O << "neq_oq"; break; - case 0xd: O << "ge"; break; - case 0xe: O << "gt"; break; - case 0xf: O << "true"; break; - } } -void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; +static void printSSEAVXCC(int64_t Imm, raw_ostream &O) { switch (Imm) { - default: llvm_unreachable("Invalid avxcc argument!"); + default: llvm_unreachable("Invalid ssecc/avxcc argument!"); case 0: O << "eq"; break; case 1: O << "lt"; break; case 2: O << "le"; break; @@ -124,6 +110,20 @@ void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, } } +void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + assert((Imm & 0x7) == Imm); // Ensure valid immediate. + printSSEAVXCC(Imm, O); +} + +void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + assert((Imm & 0x1f) == Imm); // Ensure valid immediate. 
+ printSSEAVXCC(Imm, O); +} + void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm() & 0x3; @@ -151,8 +151,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, int64_t Address; if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { O << formatHex((uint64_t)Address); - } - else { + } else { // Otherwise, just print the expression. O << *Op.getExpr(); } @@ -170,7 +169,11 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, << '$' << formatImm((int64_t)Op.getImm()) << markup(">"); - if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) + // If there are no instruction-specific comments, add a comment clarifying + // the hex value of the immediate operand when it isn't in the range + // [-256,255]. + if (CommentStream && !HasCustomInstComment && + (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); } else { diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 531183b02edf..649dc84a0cfc 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_ATT_INST_PRINTER_H -#define X86_ATT_INST_PRINTER_H +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { @@ -23,8 +24,11 @@ class MCOperand; class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI) - : MCInstPrinter(MAI, MII, MRI) {} + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) + : 
MCInstPrinter(MAI, MII, MRI) { + // Initialize the set of available features. + setAvailableFeatures(STI.getFeatureBits()); + } void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override; @@ -49,10 +53,14 @@ public: void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } - + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } @@ -129,8 +137,11 @@ public: void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemOffset(MI, OpNo, O); } + +private: + bool HasCustomInstComment; }; - + } #endif diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index baf6507f8244..a4c0ca882c71 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -28,20 +28,127 @@ using namespace llvm; /// EmitAnyX86InstComments - This function decodes x86 instructions and prints /// newline terminated strings to the specified string if desired. This /// information is shown in disassembly dumps when verbose assembly is enabled. -void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, +bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)) { // If this is a shuffle operation, the switch should fill in this state. SmallVector<int, 8> ShuffleMask; const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr; switch (MI->getOpcode()) { + default: + // Not an instruction for which we can decode comments. 
+ return false; + + case X86::BLENDPDrri: + case X86::VBLENDPDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPDrmi: + case X86::VBLENDPDrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v2f64, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VBLENDPDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VBLENDPDYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v4f64, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::BLENDPSrri: + case X86::VBLENDPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::BLENDPSrmi: + case X86::VBLENDPSrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v4f32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VBLENDPSYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VBLENDPSYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v8f32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::PBLENDWrri: + case X86::VPBLENDWrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::PBLENDWrmi: + case X86::VPBLENDWrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v8i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::VPBLENDWYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDWYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v16i16, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPBLENDDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDDrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v4i32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + + case X86::VPBLENDDYrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPBLENDDYrmi: + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeBLENDMask(MVT::v8i32, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::INSERTPSrr: case X86::VINSERTPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::INSERTPSrm: + case X86::VINSERTPSrm: DestName = getRegName(MI->getOperand(0).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); - Src2Name = getRegName(MI->getOperand(2).getReg()); - if(MI->getOperand(3).isImm()) - DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodeINSERTPSMask(MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); break; case X86::MOVLHPSrr: @@ -60,6 +167,80 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeMOVHLPSMask(2, ShuffleMask); break; + case X86::MOVSLDUPrr: + case X86::VMOVSLDUPrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::MOVSLDUPrm: + case X86::VMOVSLDUPrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); + break; + + case X86::VMOVSHDUPYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VMOVSHDUPYrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); + break; + + case X86::VMOVSLDUPYrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. + case X86::VMOVSLDUPYrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); + break; + + case X86::MOVSHDUPrr: + case X86::VMOVSHDUPrr: + Src1Name = getRegName(MI->getOperand(1).getReg()); + // FALL THROUGH. 
+ case X86::MOVSHDUPrm: + case X86::VMOVSHDUPrm: + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + break; + + case X86::PSLLDQri: + case X86::VPSLLDQri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSLLDQMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::VPSLLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSLLDQMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::PSRLDQri: + case X86::VPSRLDQri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSRLDQMask(MVT::v16i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + + case X86::VPSRLDQYri: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + if(MI->getOperand(MI->getNumOperands()-1).isImm()) + DecodePSRLDQMask(MVT::v32i8, + MI->getOperand(MI->getNumOperands()-1).getImm(), + ShuffleMask); + break; + case X86::PALIGNR128rr: case X86::VPALIGNR128rr: Src1Name = getRegName(MI->getOperand(2).getReg()); @@ -208,6 +389,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); break; + case X86::VPUNPCKHDQZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPUNPCKHDQZrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v16i32, ShuffleMask); + break; case X86::PUNPCKHQDQrr: case X86::VPUNPCKHQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -226,6 +415,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); break; + case X86::VPUNPCKHQDQZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKHQDQZrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKHMask(MVT::v8i64, ShuffleMask); + break; case X86::PUNPCKLBWrr: case X86::VPUNPCKLBWrr: @@ -281,6 +478,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); break; + case X86::VPUNPCKLDQZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VPUNPCKLDQZrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v16i32, ShuffleMask); + break; case X86::PUNPCKLQDQrr: case X86::VPUNPCKLQDQrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -299,6 +504,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); break; + case X86::VPUNPCKLQDQZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VPUNPCKLQDQZrm: + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(MVT::v8i64, ShuffleMask); + break; case X86::SHUFPDrri: case X86::VSHUFPDrri: @@ -368,6 +581,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPDZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPDZrm: + DecodeUNPCKLMask(MVT::v8f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::UNPCKLPSrr: case X86::VUNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -386,6 +607,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKLPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKLPSZrm: + DecodeUNPCKLMask(MVT::v16f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::UNPCKHPDrr: case X86::VUNPCKHPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -404,6 +633,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKHPDZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. 
+ case X86::VUNPCKHPDZrm: + DecodeUNPCKHMask(MVT::v8f64, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::UNPCKHPSrr: case X86::VUNPCKHPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -422,6 +659,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VUNPCKHPSZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VUNPCKHPSZrm: + DecodeUNPCKHMask(MVT::v16f32, ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::VPERMILPSri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. @@ -489,54 +734,59 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; } + // The only comments we decode are shuffles, so give up if we were unable to + // decode a shuffle mask. + if (ShuffleMask.empty()) + return false; - // If this was a shuffle operation, print the shuffle mask. - if (!ShuffleMask.empty()) { - if (!DestName) DestName = Src1Name; - OS << (DestName ? DestName : "mem") << " = "; + if (!DestName) DestName = Src1Name; + OS << (DestName ? DestName : "mem") << " = "; - // If the two sources are the same, canonicalize the input elements to be - // from the first src so that we get larger element spans. - if (Src1Name == Src2Name) { - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if ((int)ShuffleMask[i] >= 0 && // Not sentinel. - ShuffleMask[i] >= (int)e) // From second mask. - ShuffleMask[i] -= e; - } + // If the two sources are the same, canonicalize the input elements to be + // from the first src so that we get larger element spans. 
+ if (Src1Name == Src2Name) { + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if ((int)ShuffleMask[i] >= 0 && // Not sentinel. + ShuffleMask[i] >= (int)e) // From second mask. + ShuffleMask[i] -= e; } + } - // The shuffle mask specifies which elements of the src1/src2 fill in the - // destination, with a few sentinel values. Loop through and print them - // out. - for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { - if (i != 0) + // The shuffle mask specifies which elements of the src1/src2 fill in the + // destination, with a few sentinel values. Loop through and print them + // out. + for (unsigned i = 0, e = ShuffleMask.size(); i != e; ++i) { + if (i != 0) + OS << ','; + if (ShuffleMask[i] == SM_SentinelZero) { + OS << "zero"; + continue; + } + + // Otherwise, it must come from src1 or src2. Print the span of elements + // that comes from this src. + bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); + const char *SrcName = isSrc1 ? Src1Name : Src2Name; + OS << (SrcName ? SrcName : "mem") << '['; + bool IsFirst = true; + while (i != e && (int)ShuffleMask[i] != SM_SentinelZero && + (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { + if (!IsFirst) OS << ','; - if (ShuffleMask[i] == SM_SentinelZero) { - OS << "zero"; - continue; - } - - // Otherwise, it must come from src1 or src2. Print the span of elements - // that comes from this src. - bool isSrc1 = ShuffleMask[i] < (int)ShuffleMask.size(); - const char *SrcName = isSrc1 ? Src1Name : Src2Name; - OS << (SrcName ? SrcName : "mem") << '['; - bool IsFirst = true; - while (i != e && - (int)ShuffleMask[i] >= 0 && - (ShuffleMask[i] < (int)ShuffleMask.size()) == isSrc1) { - if (!IsFirst) - OS << ','; - else - IsFirst = false; + else + IsFirst = false; + if (ShuffleMask[i] == SM_SentinelUndef) + OS << "u"; + else OS << ShuffleMask[i] % ShuffleMask.size(); - ++i; - } - OS << ']'; - --i; // For loop increments element #. 
+ ++i; } - //MI->print(OS, 0); - OS << "\n"; + OS << ']'; + --i; // For loop increments element #. } + //MI->print(OS, 0); + OS << "\n"; + // We successfully added a comment to this instruction. + return true; } diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h index 13fdf9af8c98..687581b10aec 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.h @@ -12,13 +12,13 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_INST_COMMENTS_H -#define X86_INST_COMMENTS_H +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H namespace llvm { class MCInst; class raw_ostream; - void EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, + bool EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, const char *(*getRegName)(unsigned)); } diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index 1c8466bf2945..449445df7d27 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -50,33 +50,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); } -void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0xf; - switch (Imm) { - default: llvm_unreachable("Invalid ssecc argument!"); - case 0: O << "eq"; break; - case 1: O << "lt"; break; - case 2: O << "le"; break; - case 3: O << "unord"; break; - case 4: O << "neq"; break; - case 5: O << "nlt"; break; - case 6: O << "nle"; break; - case 7: O << "ord"; break; - case 8: O << "eq_uq"; break; - case 9: O << "nge"; break; - case 
0xa: O << "ngt"; break; - case 0xb: O << "false"; break; - case 0xc: O << "neq_oq"; break; - case 0xd: O << "ge"; break; - case 0xe: O << "gt"; break; - case 0xf: O << "true"; break; - } -} - -void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, - raw_ostream &O) { - int64_t Imm = MI->getOperand(Op).getImm() & 0x1f; +static void printSSEAVXCC(int64_t Imm, raw_ostream &O) { switch (Imm) { default: llvm_unreachable("Invalid avxcc argument!"); case 0: O << "eq"; break; @@ -114,6 +88,20 @@ void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, } } +void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + assert((Imm & 0x7) == Imm); // Ensure valid immediate. + printSSEAVXCC(Imm, O); +} + +void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op, + raw_ostream &O) { + int64_t Imm = MI->getOperand(Op).getImm(); + assert((Imm & 0x1f) == Imm); // Ensure valid immediate. + printSSEAVXCC(Imm, O); +} + void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm() & 0x3; @@ -168,21 +156,21 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); - + // If this has a segment register, print it. 
if (SegReg.getReg()) { printOperand(MI, Op+X86::AddrSegmentReg, O); O << ':'; } - + O << '['; - + bool NeedPlus = false; if (BaseReg.getReg()) { printOperand(MI, Op+X86::AddrBaseReg, O); NeedPlus = true; } - + if (IndexReg.getReg()) { if (NeedPlus) O << " + "; if (ScaleVal != 1) @@ -209,7 +197,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, O << formatImm(DispVal); } } - + O << ']'; } diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 4d9b48155430..641423d56d03 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_INTEL_INST_PRINTER_H -#define X86_INTEL_INST_PRINTER_H +#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H +#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" #include "llvm/Support/raw_ostream.h" @@ -44,11 +44,15 @@ public: void printDstIdx(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &OS); + void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + printMemReference(MI, OpNo, O); + } + void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "opaque ptr "; printMemReference(MI, OpNo, O); } - + void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "byte ptr "; printMemReference(MI, OpNo, O); @@ -152,7 +156,7 @@ public: printMemOffset(MI, OpNo, O); } }; - + } #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 23bca0d53d3d..164b4192ae66 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ 
b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -11,7 +11,6 @@ #include "MCTargetDesc/X86FixupKinds.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -437,10 +436,30 @@ class DarwinX86AsmBackend : public X86AsmBackend { bool Is64Bit; unsigned OffsetSize; ///< Offset of a "push" instruction. - unsigned PushInstrSize; ///< Size of a "push" instruction. unsigned MoveInstrSize; ///< Size of a "move" instruction. - unsigned StackDivide; ///< Amount to adjust stack stize by. + unsigned StackDivide; ///< Amount to adjust stack size by. protected: + /// \brief Size of a "push" instruction for the given register. + unsigned PushInstrSize(unsigned Reg) const { + switch (Reg) { + case X86::EBX: + case X86::ECX: + case X86::EDX: + case X86::EDI: + case X86::ESI: + case X86::EBP: + case X86::RBX: + case X86::RBP: + return 1; + case X86::R12: + case X86::R13: + case X86::R14: + case X86::R15: + return 2; + } + return 1; + } + /// \brief Implementation of algorithm to generate the compact unwind encoding /// for the CFI instructions. uint32_t @@ -493,7 +512,7 @@ protected: // Defines a new offset for the CFA. E.g. // // With frame: - // + // // pushq %rbp // L0: // .cfi_def_cfa_offset 16 @@ -530,7 +549,7 @@ protected: unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); SavedRegs[SavedRegIdx++] = Reg; StackAdjust += OffsetSize; - InstrOffset += PushInstrSize; + InstrOffset += PushInstrSize(Reg); break; } } @@ -663,7 +682,7 @@ private: // 4 3 // 5 3 // - for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { + for (unsigned i = 0; i < RegCount; ++i) { int CUReg = getCompactUnwindRegNum(SavedRegs[i]); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; @@ -724,7 +743,6 @@ public: OffsetSize = Is64Bit ? 8 : 4; MoveInstrSize = Is64Bit ? 3 : 2; StackDivide = Is64Bit ? 
8 : 4; - PushInstrSize = 1; } }; @@ -772,26 +790,6 @@ public: return SMO.getType() == MachO::S_CSTRING_LITERALS; } - bool isSectionAtomizable(const MCSection &Section) const override { - const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section); - // Fixed sized data sections are uniqued, they cannot be diced into atoms. - switch (SMO.getType()) { - default: - return true; - - case MachO::S_4BYTE_LITERALS: - case MachO::S_8BYTE_LITERALS: - case MachO::S_16BYTE_LITERALS: - case MachO::S_LITERAL_POINTERS: - case MachO::S_NON_LAZY_SYMBOL_POINTERS: - case MachO::S_LAZY_SYMBOL_POINTERS: - case MachO::S_MOD_INIT_FUNC_POINTERS: - case MachO::S_MOD_TERM_FUNC_POINTERS: - case MachO::S_INTERPOSING: - return false; - } - } - /// \brief Generate the compact unwind encoding for the CFI instructions. uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 026e4c487d81..28be15beb675 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -14,8 +14,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86BASEINFO_H -#define X86BASEINFO_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86BASEINFO_H #include "X86MCTargetDesc.h" #include "llvm/MC/MCInstrDesc.h" @@ -216,7 +216,7 @@ namespace X86II { MO_SECREL }; - enum { + enum : uint64_t { //===------------------------------------------------------------------===// // Instruction encodings. These are the standard/most common forms for X86 // instructions. @@ -303,17 +303,18 @@ namespace X86II { //// MRM_XX - A mod/rm byte of exactly 0xXX. 
MRM_C0 = 32, MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36, MRM_C8 = 37, MRM_C9 = 38, MRM_CA = 39, - MRM_CB = 40, MRM_D0 = 41, MRM_D1 = 42, MRM_D4 = 43, - MRM_D5 = 44, MRM_D6 = 45, MRM_D8 = 46, MRM_D9 = 47, - MRM_DA = 48, MRM_DB = 49, MRM_DC = 50, MRM_DD = 51, - MRM_DE = 52, MRM_DF = 53, MRM_E0 = 54, MRM_E1 = 55, - MRM_E2 = 56, MRM_E3 = 57, MRM_E4 = 58, MRM_E5 = 59, - MRM_E8 = 60, MRM_E9 = 61, MRM_EA = 62, MRM_EB = 63, - MRM_EC = 64, MRM_ED = 65, MRM_EE = 66, MRM_F0 = 67, - MRM_F1 = 68, MRM_F2 = 69, MRM_F3 = 70, MRM_F4 = 71, - MRM_F5 = 72, MRM_F6 = 73, MRM_F7 = 74, MRM_F8 = 75, - MRM_F9 = 76, MRM_FA = 77, MRM_FB = 78, MRM_FC = 79, - MRM_FD = 80, MRM_FE = 81, MRM_FF = 82, + MRM_CB = 40, MRM_CF = 41, MRM_D0 = 42, MRM_D1 = 43, + MRM_D4 = 44, MRM_D5 = 45, MRM_D6 = 46, MRM_D7 = 47, + MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50, MRM_DB = 51, + MRM_DC = 52, MRM_DD = 53, MRM_DE = 54, MRM_DF = 55, + MRM_E0 = 56, MRM_E1 = 57, MRM_E2 = 58, MRM_E3 = 59, + MRM_E4 = 60, MRM_E5 = 61, MRM_E8 = 62, MRM_E9 = 63, + MRM_EA = 64, MRM_EB = 65, MRM_EC = 66, MRM_ED = 67, + MRM_EE = 68, MRM_F0 = 69, MRM_F1 = 70, MRM_F2 = 71, + MRM_F3 = 72, MRM_F4 = 73, MRM_F5 = 74, MRM_F6 = 75, + MRM_F7 = 76, MRM_F8 = 77, MRM_F9 = 78, MRM_FA = 79, + MRM_FB = 80, MRM_FC = 81, MRM_FD = 82, MRM_FE = 83, + MRM_FF = 84, FormMask = 127, @@ -327,21 +328,28 @@ namespace X86II { OpSizeShift = 7, OpSizeMask = 0x3 << OpSizeShift, - OpSize16 = 1, - OpSize32 = 2, + OpSizeFixed = 0 << OpSizeShift, + OpSize16 = 1 << OpSizeShift, + OpSize32 = 2 << OpSizeShift, - // AsSize - Set if this instruction requires an operand size prefix (0x67), - // which most often indicates that the instruction address 16 bit address - // instead of 32 bit address (or 32 bit address in 64 bit mode). + // AsSize - AdSizeX implies this instruction determines its need of 0x67 + // prefix from a normal ModRM memory operand. 
The other types indicate that + // an operand is encoded with a specific width and a prefix is needed if + // it differs from the current mode. AdSizeShift = OpSizeShift + 2, - AdSize = 1 << AdSizeShift, + AdSizeMask = 0x3 << AdSizeShift, + + AdSizeX = 1 << AdSizeShift, + AdSize16 = 1 << AdSizeShift, + AdSize32 = 2 << AdSizeShift, + AdSize64 = 3 << AdSizeShift, //===------------------------------------------------------------------===// // OpPrefix - There are several prefix bytes that are used as opcode // extensions. These are 0x66, 0xF3, and 0xF2. If this field is 0 there is // no prefix. // - OpPrefixShift = AdSizeShift + 1, + OpPrefixShift = AdSizeShift + 2, OpPrefixMask = 0x7 << OpPrefixShift, // PS, PD - Prefix code for packed single and double precision vector @@ -454,51 +462,53 @@ namespace X86II { EncodingMask = 0x3 << EncodingShift, // VEX - encoding using 0xC4/0xC5 - VEX = 1, + VEX = 1 << EncodingShift, /// XOP - Opcode prefix used by XOP instructions. - XOP = 2, + XOP = 2 << EncodingShift, // VEX_EVEX - Specifies that this instruction use EVEX form which provides // syntax support up to 32 512-bit register operands and up to 7 16-bit // mask operands as well as source operand data swizzling/memory operand // conversion, eviction hint, and rounding mode. - EVEX = 3, + EVEX = 3 << EncodingShift, // Opcode OpcodeShift = EncodingShift + 2, - //===------------------------------------------------------------------===// - /// VEX - The opcode prefix used by AVX instructions - VEXShift = OpcodeShift + 8, - /// VEX_W - Has a opcode specific functionality, but is used in the same /// way as REX_W is for regular SSE instructions. - VEX_W = 1U << 0, + VEX_WShift = OpcodeShift + 8, + VEX_W = 1ULL << VEX_WShift, /// VEX_4V - Used to specify an additional AVX/SSE register. Several 2 /// address instructions in SSE are represented as 3 address ones in AVX /// and the additional register is encoded in VEX_VVVV prefix. 
- VEX_4V = 1U << 1, + VEX_4VShift = VEX_WShift + 1, + VEX_4V = 1ULL << VEX_4VShift, /// VEX_4VOp3 - Similar to VEX_4V, but used on instructions that encode /// operand 3 with VEX.vvvv. - VEX_4VOp3 = 1U << 2, + VEX_4VOp3Shift = VEX_4VShift + 1, + VEX_4VOp3 = 1ULL << VEX_4VOp3Shift, /// VEX_I8IMM - Specifies that the last register used in a AVX instruction, /// must be encoded in the i8 immediate field. This usually happens in /// instructions with 4 operands. - VEX_I8IMM = 1U << 3, + VEX_I8IMMShift = VEX_4VOp3Shift + 1, + VEX_I8IMM = 1ULL << VEX_I8IMMShift, /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current /// instruction uses 256-bit wide registers. This is usually auto detected /// if a VR256 register is used, but some AVX instructions also have this /// field marked when using a f256 memory references. - VEX_L = 1U << 4, + VEX_LShift = VEX_I8IMMShift + 1, + VEX_L = 1ULL << VEX_LShift, // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX // prefix. Usually used for scalar instructions. Needed by disassembler. - VEX_LIG = 1U << 5, + VEX_LIGShift = VEX_LShift + 1, + VEX_LIG = 1ULL << VEX_LIGShift, // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field // with following encoding: @@ -509,20 +519,24 @@ namespace X86II { // this will save 1 tsflag bit // EVEX_K - Set if this instruction requires masking - EVEX_K = 1U << 6, + EVEX_KShift = VEX_LIGShift + 1, + EVEX_K = 1ULL << EVEX_KShift, // EVEX_Z - Set if this instruction has EVEX.Z field set. - EVEX_Z = 1U << 7, + EVEX_ZShift = EVEX_KShift + 1, + EVEX_Z = 1ULL << EVEX_ZShift, // EVEX_L2 - Set if this instruction has EVEX.L' field set. - EVEX_L2 = 1U << 8, + EVEX_L2Shift = EVEX_ZShift + 1, + EVEX_L2 = 1ULL << EVEX_L2Shift, // EVEX_B - Set if this instruction has EVEX.B field set. - EVEX_B = 1U << 9, + EVEX_BShift = EVEX_L2Shift + 1, + EVEX_B = 1ULL << EVEX_BShift, // The scaling factor for the AVX512's 8-bit compressed displacement. 
- CD8_Scale_Shift = VEXShift + 10, - CD8_Scale_Mask = 127, + CD8_Scale_Shift = EVEX_BShift + 1, + CD8_Scale_Mask = 127ULL << CD8_Scale_Shift, /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents @@ -531,16 +545,16 @@ namespace X86II { /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. Has3DNow0F0FOpcodeShift = CD8_Scale_Shift + 7, - Has3DNow0F0FOpcode = 1U << (Has3DNow0F0FOpcodeShift - VEXShift), + Has3DNow0F0FOpcode = 1ULL << Has3DNow0F0FOpcodeShift, /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in /// ModRM or I8IMM. This is used for FMA4 and XOP instructions. MemOp4Shift = Has3DNow0F0FOpcodeShift + 1, - MemOp4 = 1U << (MemOp4Shift - VEXShift), + MemOp4 = 1ULL << MemOp4Shift, /// Explicitly specified rounding control EVEX_RCShift = MemOp4Shift + 1, - EVEX_RC = 1U << (EVEX_RCShift - VEXShift) + EVEX_RC = 1ULL << EVEX_RCShift }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -642,10 +656,10 @@ namespace X86II { /// counted as one operand. /// inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) { - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); - + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!"); case X86II::Pseudo: @@ -662,19 +676,10 @@ namespace X86II { return -1; case X86II::MRMDestMem: return 0; - case X86II::MRMSrcMem: { - unsigned FirstMemOp = 1; - if (HasVEX_4V) - ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). 
- if (HasMemOp4) - ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). - if (HasEVEX_K) - ++FirstMemOp;// Skip the mask register - // FIXME: Maybe lea should have its own form? This is a horrible hack. - //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || - // Opcode == X86::LEA16r || Opcode == X86::LEA32r) - return FirstMemOp; - } + case X86II::MRMSrcMem: + // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a + // mask register. + return 1 + HasVEX_4V + HasMemOp4 + HasEVEX_K; case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: @@ -685,32 +690,27 @@ namespace X86II { case X86II::MRM0m: case X86II::MRM1m: case X86II::MRM2m: case X86II::MRM3m: case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - unsigned FirstMemOp = 0; - if (HasVEX_4V) - ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV). - if (HasEVEX_K) - ++FirstMemOp;// Skip the mask register - return FirstMemOp; - } + case X86II::MRM6m: case X86II::MRM7m: + // Start from 0, skip registers encoded in VEX_VVVV or a mask register. 
+ return 0 + HasVEX_4V + HasEVEX_K; case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: - case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: - case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0: - case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: - case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: - case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC: - case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: + case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: + case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: + case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: + case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: + case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: + case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: + case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: + case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: + case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: + case X86II::MRM_FE: case X86II::MRM_FF: 
return -1; } } @@ -751,7 +751,7 @@ namespace X86II { (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31)); } - + inline bool isX86_64NonExtLowByteReg(unsigned reg) { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 3fdec874cbac..e8b0b4c5826f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -77,7 +77,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_GOTTPOFF: Type = ELF::R_X86_64_GOTTPOFF; - break; + break; case MCSymbolRefExpr::VK_TLSGD: Type = ELF::R_X86_64_TLSGD; break; @@ -222,6 +222,9 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, case MCSymbolRefExpr::VK_GOT: Type = ELF::R_386_GOT32; break; + case MCSymbolRefExpr::VK_PLT: + Type = ELF::R_386_PLT32; + break; case MCSymbolRefExpr::VK_GOTOFF: Type = ELF::R_386_GOTOFF; break; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 09396b790df2..4899900dcef9 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_X86_X86FIXUPKINDS_H -#define LLVM_X86_X86FIXUPKINDS_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86FIXUPKINDS_H #include "llvm/MC/MCFixup.h" diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index b1411bc5040d..5795514beec9 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ 
-102,21 +102,12 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { TextAlignFillValue = 0x90; - // Set up DWARF directives - HasLEB128 = true; // Target asm supports leb128 directives (little-endian) - // Debug Information SupportsDebugInformation = true; // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - // OpenBSD and Bitrig have buggy support for .quad in 32-bit mode, just split - // into two .words. - if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) && - T.getArch() == Triple::x86) - Data64bitsDirective = nullptr; - // Always enable the integrated assembler by default. // Clang also enabled it when the OS is Solaris but that is redundant here. UseIntegratedAssembler = true; @@ -133,19 +124,17 @@ X86_64MCAsmInfoDarwin::getExprForPersonalitySymbol(const MCSymbol *Sym, return MCBinaryExpr::CreateAdd(Res, Four, Context); } -const MCSection *X86ELFMCAsmInfo:: -getNonexecutableStackSection(MCContext &Ctx) const { - return Ctx.getELFSection(".note.GNU-stack", ELF::SHT_PROGBITS, - 0, SectionKind::getMetadata()); -} - void X86MCAsmInfoMicrosoft::anchor() { } X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; PointerSize = 8; - ExceptionsType = ExceptionHandling::WinEH; + WinEHEncodingType = WinEH::EncodingType::Itanium; + + // Use MSVC-compatible EH data. 
+ ExceptionsType = ExceptionHandling::MSVC; } AssemblerDialect = AsmWriterFlavor; @@ -163,8 +152,10 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { assert(Triple.isOSWindows() && "Windows is the only supported COFF target"); if (Triple.getArch() == Triple::x86_64) { PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; PointerSize = 8; - ExceptionsType = ExceptionHandling::WinEH; + WinEHEncodingType = WinEH::EncodingType::Itanium; + ExceptionsType = ExceptionHandling::ItaniumWinEH; } else { ExceptionsType = ExceptionHandling::DwarfCFI; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index a7509b03809a..deaad2a5b8e8 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86TARGETASMINFO_H -#define X86TARGETASMINFO_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmInfoCOFF.h" @@ -23,7 +23,8 @@ namespace llvm { class Triple; class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { - void anchor() override; + virtual void anchor(); + public: explicit X86MCAsmInfoDarwin(const Triple &Triple); }; @@ -39,8 +40,6 @@ namespace llvm { void anchor() override; public: explicit X86ELFMCAsmInfo(const Triple &Triple); - const MCSection * - getNonexecutableStackSection(MCContext &Ctx) const override; }; class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 075db11027d1..5d925243dd68 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -185,12 
+185,11 @@ static bool isDisp8(int Value) { /// isCDisp8 - Return true if this signed displacement fits in a 8-bit /// compressed dispacement field. static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) { - assert(((TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift == X86II::EVEX) && + assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) && "Compressed 8-bit displacement is only valid for EVEX inst."); unsigned CD8_Scale = - (TSFlags >> X86II::CD8_Scale_Shift) & X86II::CD8_Scale_Mask; + (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift; if (CD8_Scale == 0) { CValue = Value; return isDisp8(Value); @@ -373,9 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt); const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); unsigned BaseReg = Base.getReg(); - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - bool HasEVEX = (Encoding == X86II::EVEX); + bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX; // Handle %rip relative addressing. 
if (BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode @@ -593,13 +590,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - bool HasEVEX_RC = (TSFlags >> X86II::VEXShift) & X86II::EVEX_RC; + assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX."); + + uint64_t Encoding = TSFlags & X86II::EncodingMask; + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; // VEX_R: opcode externsion equivalent to REX.R in // 1's complement (inverted) form @@ -680,18 +678,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, bool EncodeRC = false; - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) + if (TSFlags & X86II::VEX_W) VEX_W = 1; - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) + if (TSFlags & X86II::VEX_L) VEX_L = 1; - if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2)) + if (TSFlags & X86II::EVEX_L2) EVEX_L2 = 1; - if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z)) + if (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) EVEX_z = 1; - if (((TSFlags >> X86II::VEXShift) & X86II::EVEX_B)) + if ((TSFlags & X86II::EVEX_B)) EVEX_b = 1; switch (TSFlags & X86II::OpPrefixMask) { @@ -725,7 +723,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // MemAddr, src1(VEX_4V), src2(ModR/M) // MemAddr, src1(ModR/M), imm8 // - if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand 
+ + if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + X86::AddrBaseReg).getReg())) VEX_B = 0x0; if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + @@ -867,7 +865,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3; } EncodeRC = true; - } + } break; case X86II::MRMDestReg: // MRMDestReg instructions forms: @@ -1109,10 +1107,14 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, raw_ostream &OS) const { // Emit the operand size opcode prefix as needed. - unsigned char OpSize = (TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift; - if (OpSize == (is16BitMode(STI) ? X86II::OpSize32 : X86II::OpSize16)) + if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32 + : X86II::OpSize16)) EmitByte(0x66, CurByte, OS); + // Emit the LOCK opcode prefix. + if (TSFlags & X86II::LOCK) + EmitByte(0xF0, CurByte, OS); + switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: // 66 EmitByte(0x66, CurByte, OS); @@ -1170,27 +1172,22 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned CurByte = 0; // Encoding type for this instruction. - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; + uint64_t Encoding = TSFlags & X86II::EncodingMask; // It uses the VEX.VVVV field? - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; + bool HasVEX_4V = TSFlags & X86II::VEX_4V; + bool HasVEX_4VOp3 = TSFlags & X86II::VEX_4VOp3; + bool HasMemOp4 = TSFlags & X86II::MemOp4; const unsigned MemOp4_I8IMMOperand = 2; // It uses the EVEX.aaa field? 
- bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K); - bool HasEVEX_RC = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_RC); - + bool HasEVEX_K = TSFlags & X86II::EVEX_K; + bool HasEVEX_RC = TSFlags & X86II::EVEX_RC; + // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); if (MemoryOperand != -1) MemoryOperand += CurOp; - // Emit the lock opcode prefix as needed. - if (TSFlags & X86II::LOCK) - EmitByte(0xF0, CurByte, OS); - // Emit segment override opcode prefix as needed. if (MemoryOperand >= 0) EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg, @@ -1202,16 +1199,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // Emit the address size opcode prefix as needed. bool need_address_override; - // The AdSize prefix is only for 32-bit and 64-bit modes. Hm, perhaps we - // should introduce an AdSize16 bit instead of having seven special cases? - if ((!is16BitMode(STI) && TSFlags & X86II::AdSize) || - (is16BitMode(STI) && (MI.getOpcode() == X86::JECXZ_32 || - MI.getOpcode() == X86::MOV8o8a || - MI.getOpcode() == X86::MOV16o16a || - MI.getOpcode() == X86::MOV32o32a || - MI.getOpcode() == X86::MOV8ao8 || - MI.getOpcode() == X86::MOV16ao16 || - MI.getOpcode() == X86::MOV32ao32))) { + uint64_t AdSize = TSFlags & X86II::AdSizeMask; + if ((is16BitMode(STI) && AdSize == X86II::AdSize32) || + (is32BitMode(STI) && AdSize == X86II::AdSize16) || + (is64BitMode(STI) && AdSize == X86II::AdSize32)) { need_address_override = true; } else if (MemoryOperand < 0) { need_address_override = false; @@ -1237,7 +1228,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags); - if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode) + if (TSFlags & X86II::Has3DNow0F0FOpcode) BaseOpcode = 0x0F; // Weird 3DNow! encoding. 
unsigned SrcRegNum = 0; @@ -1437,20 +1428,21 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: - case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: - case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0: - case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: - case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: - case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_FA: case X86II::MRM_FB: case X86II::MRM_FC: - case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: + case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: + case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: + case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: + case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: + case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: + case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: + case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: + case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: + case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: + case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: + case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: + case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: + case X86II::MRM_FB: case X86II::MRM_FC: 
case X86II::MRM_FD: + case X86II::MRM_FE: case X86II::MRM_FF: EmitByte(BaseOpcode, CurByte, OS); unsigned char MRM; @@ -1465,11 +1457,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_C9: MRM = 0xC9; break; case X86II::MRM_CA: MRM = 0xCA; break; case X86II::MRM_CB: MRM = 0xCB; break; + case X86II::MRM_CF: MRM = 0xCF; break; case X86II::MRM_D0: MRM = 0xD0; break; case X86II::MRM_D1: MRM = 0xD1; break; case X86II::MRM_D4: MRM = 0xD4; break; case X86II::MRM_D5: MRM = 0xD5; break; case X86II::MRM_D6: MRM = 0xD6; break; + case X86II::MRM_D7: MRM = 0xD7; break; case X86II::MRM_D8: MRM = 0xD8; break; case X86II::MRM_D9: MRM = 0xD9; break; case X86II::MRM_DA: MRM = 0xDA; break; @@ -1518,7 +1512,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, while (CurOp != NumOps && NumOps - CurOp <= 2) { // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of a immediate byte. - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { + if (TSFlags & X86II::VEX_I8IMM) { const MCOperand &MO = MI.getOperand(HasMemOp4 ? 
MemOp4_I8IMMOperand : CurOp); ++CurOp; @@ -1544,7 +1538,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, } } - if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode) + if (TSFlags & X86II::Has3DNow0F0FOpcode) EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); #ifndef NDEBUG diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 3bfad6c71b9b..5a9181df70e6 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -351,11 +351,8 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM, static MCStreamer *createMCStreamer(const Target &T, StringRef TT, MCContext &Ctx, MCAsmBackend &MAB, - raw_ostream &_OS, - MCCodeEmitter *_Emitter, - const MCSubtargetInfo &STI, - bool RelaxAll, - bool NoExecStack) { + raw_ostream &_OS, MCCodeEmitter *_Emitter, + const MCSubtargetInfo &STI, bool RelaxAll) { Triple TheTriple(TT); switch (TheTriple.getObjectFormat()) { @@ -366,7 +363,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT, assert(TheTriple.isOSWindows() && "only Windows COFF is supported"); return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll); case Triple::ELF: - return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack); + return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll); } } @@ -377,7 +374,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T, const MCRegisterInfo &MRI, const MCSubtargetInfo &STI) { if (SyntaxVariant == 0) - return new X86ATTInstPrinter(MAI, MII, MRI); + return new X86ATTInstPrinter(MAI, MII, MRI, STI); if (SyntaxVariant == 1) return new X86IntelInstPrinter(MAI, MII, MRI); return nullptr; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index ebe74cfeaddc..d8320b97736d 100644 --- 
a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86MCTARGETDESC_H -#define X86MCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #include "llvm/Support/DataTypes.h" #include <string> @@ -40,8 +40,8 @@ namespace DWARFFlavour { enum { X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2 }; -} - +} + /// N86 namespace - Native X86 register numbers /// namespace N86 { diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index ead333830090..67b0c8900850 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -179,11 +179,14 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (A_Base == B_Base && A_Base) report_fatal_error("unsupported relocation with identical base", false); - // A subtraction expression where both symbols are undefined is a + // A subtraction expression where either symbol is undefined is a // non-relocatable expression. - if (A->isUndefined() && B->isUndefined()) - report_fatal_error("unsupported relocation with subtraction expression", - false); + if (A->isUndefined() || B->isUndefined()) { + StringRef Name = A->isUndefined() ? A->getName() : B->getName(); + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation with subtraction expression, symbol '" + + Name + "' can not be undefined in a subtraction expression"); + } Value += Writer->getSymbolAddress(&A_SD, Layout) - (!A_Base ? 
0 : Writer->getSymbolAddress(A_Base, Layout)); @@ -193,8 +196,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (A_Base) { Index = A_Base->getIndex(); IsExtern = 1; - } - else { + } else { Index = A_SD.getFragment()->getParent()->getOrdinal() + 1; IsExtern = 0; } @@ -212,8 +214,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer, if (B_Base) { Index = B_Base->getIndex(); IsExtern = 1; - } - else { + } else { Index = B_SD.getFragment()->getParent()->getOrdinal() + 1; IsExtern = 0; } @@ -572,7 +573,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // For external relocations, make sure to offset the fixup value to // compensate for the addend of the symbol address, if it was // undefined. This occurs with weak definitions, for example. - if (!SD->Symbol->isUndefined()) + if (!SD->getSymbol().isUndefined()) FixedValue -= Layout.getSymbolOffset(SD); } else { // The index is the section ordinal (1-based). diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 6727f5edd26b..5f1596c46825 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -8,18 +8,21 @@ //===----------------------------------------------------------------------===// #include "X86MCTargetDesc.h" +#include "llvm/MC/MCWin64EH.h" #include "llvm/MC/MCWinCOFFStreamer.h" using namespace llvm; namespace { class X86WinCOFFStreamer : public MCWinCOFFStreamer { + Win64EH::UnwindEmitter EHStreamer; public: X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, raw_ostream &OS) : MCWinCOFFStreamer(C, AB, *CE, OS) { } void EmitWinEHHandlerData() override; + void EmitWindowsUnwindTables() override; void FinishImpl() override; }; @@ -28,7 +31,13 @@ void X86WinCOFFStreamer::EmitWinEHHandlerData() { // We have to emit the unwind info 
now, because this directive // actually switches to the .xdata section! - MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentWinFrameInfo()); + EHStreamer.EmitUnwindInfo(*this, getCurrentWinFrameInfo()); +} + +void X86WinCOFFStreamer::EmitWindowsUnwindTables() { + if (!getNumWinFrameInfos()) + return; + EHStreamer.Emit(*this); } void X86WinCOFFStreamer::FinishImpl() { diff --git a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index 1ea8798e316e..fceb083b5f2d 100644 --- a/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/contrib/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -13,7 +13,7 @@ using namespace llvm; Target llvm::TheX86_32Target, llvm::TheX86_64Target; -extern "C" void LLVMInitializeX86TargetInfo() { +extern "C" void LLVMInitializeX86TargetInfo() { RegisterTarget<Triple::x86, /*HasJIT=*/true> X(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above"); diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 5f2441c017f5..e4c58d42c5dc 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "X86ShuffleDecode.h" +#include "llvm/IR/Constants.h" #include "llvm/CodeGen/MachineValueType.h" //===----------------------------------------------------------------------===// @@ -62,6 +63,51 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(NElts+i); } +void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i); + ShuffleMask.push_back(2 * i); + } +} + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned 
NumElts = VT.getVectorNumElements(); + for (int i = 0, e = NumElts / 2; i < e; ++i) { + ShuffleMask.push_back(2 * i + 1); + ShuffleMask.push_back(2 * i + 1); + } +} + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + int M = SM_SentinelZero; + if (i >= Imm) M = i - Imm + l; + ShuffleMask.push_back(M); + } +} + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + unsigned VectorSizeInBits = VT.getSizeInBits(); + unsigned NumElts = VectorSizeInBits / 8; + unsigned NumLanes = VectorSizeInBits / 128; + unsigned NumLaneElts = NumElts / NumLanes; + + for (unsigned l = 0; l < NumElts; l += NumLaneElts) + for (unsigned i = 0; i < NumLaneElts; ++i) { + unsigned Base = i + Imm; + int M = Base + l; + if (Base >= NumLaneElts) M = SM_SentinelZero; + ShuffleMask.push_back(M); + } +} + void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); @@ -207,6 +253,90 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, } } +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. 
the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // This is a straightforward byte vector. + if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { + int NumElements = MaskTy->getVectorNumElements(); + ShuffleMask.reserve(NumElements); + + for (int i = 0; i < NumElements; ++i) { + // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte + // lane of the vector we're inside. + int Base = i < 16 ? 0 : 16; + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // If the high bit (7) of the byte is set, the element is zeroed. + if (Element & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (Element & 0xf); + ShuffleMask.push_back(Index); + } + } + } + // TODO: Handle funny-looking vectors too. +} + +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + if (M == (uint64_t)SM_SentinelUndef) { + ShuffleMask.push_back(M); + continue; + } + // For AVX vectors with 32 bytes the base of the shuffle is the half of + // the vector we're inside. + int Base = i < 16 ? 0 : 16; + // If the high bit (7) of the byte is set, the element is zeroed. 
+ if (M & (1 << 7)) + ShuffleMask.push_back(SM_SentinelZero); + else { + // Only the least significant 4 bits of the byte are used. + int Index = Base + (M & 0xf); + ShuffleMask.push_back(Index); + } + } +} + +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { + int ElementBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + for (int i = 0; i < NumElements; ++i) { + // If there are more than 8 elements in the vector, then any immediate blend + // mask applies to each 128-bit lane. There can never be more than + // 8 elements in a 128-bit lane with an immediate blend. + int Bit = NumElements > 8 ? i % (128 / ElementBits) : i; + assert(Bit < 8 && + "Immediate blends only operate over 8 elements at a time!"); + ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i); + } +} + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { @@ -215,4 +345,44 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } +void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); + assert(MaskTy->getVectorElementType()->isIntegerTy() && + "Expected integer constant mask elements!"); + int ElementBits = MaskTy->getScalarSizeInBits(); + int NumElements = MaskTy->getVectorNumElements(); + assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && + "Unexpected number of vector elements."); + ShuffleMask.reserve(NumElements); + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + assert((unsigned)NumElements == CDS->getNumElements() && + "Constant mask has a different number of elements!"); + + for (int i = 0; i < NumElements; ++i) { + int Base = (i * ElementBits / 128) * (128 / ElementBits); + uint64_t Element = 
CDS->getElementAsInteger(i); + // Only the least significant 2 bits of the integer are used. + int Index = Base + (Element & 0x3); + ShuffleMask.push_back(Index); + } + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + assert((unsigned)NumElements == C->getNumOperands() && + "Constant mask has a different number of elements!"); + + for (int i = 0; i < NumElements; ++i) { + int Base = (i * ElementBits / 128) * (128 / ElementBits); + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; + } + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + // Only the least significant 2 bits of the integer are used. + int Index = Base + (Element & 0x3); + ShuffleMask.push_back(Index); + } + } +} + } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 9e75b6b945c5..6ba3c64f8ec3 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -12,21 +12,21 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_SHUFFLE_DECODE_H -#define X86_SHUFFLE_DECODE_H +#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H +#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/ArrayRef.h" //===----------------------------------------------------------------------===// // Vector Mask Decoding //===----------------------------------------------------------------------===// namespace llvm { +class Constant; class MVT; -enum { - SM_SentinelZero = -1 -}; +enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -36,6 +36,14 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); // <0,2> or <0,1,4,5> void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask); 
+void DecodeMOVSLDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodeMOVSHDUPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSLLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + +void DecodePSRLDQMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); + void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -59,6 +67,16 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// different datatypes and vector widths. void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a PSHUFB mask from an IR-level vector constant. +void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a PSHUFB mask from a raw array of constants such as from +/// BUILD_VECTOR. +void DecodePSHUFBMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a BLEND immediate mask into a shuffle mask. +void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -67,6 +85,9 @@ void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, /// No VT provided since it only works on 256-bit, 4 element vectors. void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a VPERMILP variable mask from an IR-level vector constant. 
+void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); + } // llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index d5522ed95eb4..219b64d18d1d 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TARGET_X86_H -#define TARGET_X86_H +#ifndef LLVM_LIB_TARGET_X86_X86_H +#define LLVM_LIB_TARGET_X86_X86_H #include "llvm/Support/CodeGen.h" @@ -21,13 +21,8 @@ namespace llvm { class FunctionPass; class ImmutablePass; -class JITCodeEmitter; class X86TargetMachine; -/// createX86AtomicExpandPass - This pass expands atomic operations that cannot -/// be handled natively in terms of a loop using cmpxchg. -FunctionPass *createX86AtomicExpandPass(const X86TargetMachine *TM); - /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// @@ -54,11 +49,6 @@ FunctionPass *createX86FloatingPointStackifierPass(); /// AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code -/// to the specified MCE object. -FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM, - JITCodeEmitter &JCE); - /// createX86EmitCodeToMemory - Returns a pass that converts a register /// allocated function into raw machine code in a dynamically /// allocated chunk of memory. @@ -77,6 +67,11 @@ FunctionPass *createX86PadShortFunctions(); /// to eliminate execution delays in some Atom processors. FunctionPass *createX86FixupLEAs(); +/// createX86CallFrameOptimization - Return a pass that optimizes +/// the code-size of x86 call sequences. This is done by replacing +/// esp-relative movs with pushes. 
+FunctionPass *createX86CallFrameOptimization(); + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index cd32a0f24234..30b3b2876b8a 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -79,9 +79,16 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; +// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that +// explicit. Also, it seems this would be the default state for most chips +// going forward, so it would probably be better to negate the logic and +// match the 32-byte "slow mem" feature below. def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", "IsUAMemFast", "true", "Fast unaligned memory access">; +def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -125,9 +132,9 @@ def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", "Enable XOP instructions", [FeatureFMA4]>; -def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem", - "HasVectorUAMem", "true", - "Allow unaligned memory operands on vector/SIMD instructions">; +def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", + "HasSSEUnalignedMem", "true", + "Allow unaligned memory operands with SSE instructions">; def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", "Enable AES instructions", [FeatureSSE2]>; @@ -157,15 +164,22 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; +def FeatureSGX : 
SubtargetFeature<"sgx", "HasSGX", "true", + "Support SGX instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureSMAP : SubtargetFeature<"smap", "HasSMAP", "true", + "Support SMAP instructions">; def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; -def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb", - "HasSlowDivide", "true", - "Use small divide for positive values less than 256">; +def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", + "HasSlowDivide32", "true", + "Use 8-bit divide for positive values less than 256">; +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", + "HasSlowDivide64", "true", + "Use 16-bit divide for positive values less than 65536">; def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; @@ -178,6 +192,10 @@ def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", "LEA instruction with certain arguments is slow">; def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", "INC and DEC instructions are slower than ADD and SUB">; +def FeatureUseSqrtEst : SubtargetFeature<"use-sqrt-est", "UseSqrtEst", "true", + "Use RSQRT* to optimize square root calculations">; +def FeatureUseRecipEst : SubtargetFeature<"use-recip-est", "UseReciprocalEst", + "true", "Use RCP* to optimize division calculations">; //===----------------------------------------------------------------------===// // X86 processors supported. @@ -222,78 +240,167 @@ def : ProcessorModel<"core2", SandyBridgeModel, def : ProcessorModel<"penryn", SandyBridgeModel, [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -// Atom. 
-def : ProcessorModel<"atom", AtomModel, - [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, - FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, - FeatureSlowDivide, - FeatureCallRegIndirect, - FeatureLEAUsesAG, - FeaturePadShortFunctions]>; - -// Atom Silvermont. -def : ProcessorModel<"slm", SLMModel, [ProcIntelSLM, - FeatureSSE42, FeatureCMPXCHG16B, - FeatureMOVBE, FeaturePOPCNT, - FeaturePCLMUL, FeatureAES, - FeatureCallRegIndirect, - FeaturePRFCHW, - FeatureSlowLEA, FeatureSlowIncDec, - FeatureSlowBTMem, FeatureFastUAMem]>; +// Atom CPUs. +class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ + ProcIntelAtom, + FeatureSSSE3, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLeaForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions + ]>; +def : BonnellProc<"bonnell">; +def : BonnellProc<"atom">; // Pin the generic name to the baseline. + +class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelSLM, + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureFastUAMem + ]>; +def : SilvermontProc<"silvermont">; +def : SilvermontProc<"slm">; // Legacy alias. 
+ // "Arrandale" along with corei3 and corei5 -def : ProcessorModel<"corei7", SandyBridgeModel, - [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES]>; +class NehalemProc<string Name, list<SubtargetFeature> AdditionalFeatures> + : ProcessorModel<Name, SandyBridgeModel, !listconcat([ + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureFastUAMem, + FeaturePOPCNT + ], + AdditionalFeatures)>; +def : NehalemProc<"nehalem", []>; +def : NehalemProc<"corei7", [FeatureAES]>; -def : ProcessorModel<"nehalem", SandyBridgeModel, - [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT]>; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge -def : ProcessorModel<"westmere", SandyBridgeModel, - [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem, - FeatureFastUAMem, FeaturePOPCNT, FeatureAES, - FeaturePCLMUL]>; -// Sandy Bridge +class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureSSE42, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureFastUAMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL + ]>; +def : WestmereProc<"westmere">; + // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. 
-def : ProcessorModel<"corei7-avx", SandyBridgeModel, - [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>; -// Ivy Bridge -def : ProcessorModel<"core-avx-i", SandyBridgeModel, - [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, - FeatureF16C, FeatureFSGSBase]>; - -// Haswell -def : ProcessorModel<"core-avx2", HaswellModel, - [FeatureAVX2, FeatureCMPXCHG16B, FeatureFastUAMem, - FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, - FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, - FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, - FeatureHLE]>; - -// KNL +class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureAVX, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL + ]>; +def : SandyBridgeProc<"sandybridge">; +def : SandyBridgeProc<"corei7-avx">; // Legacy alias. + +class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureAVX, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase + ]>; +def : IvyBridgeProc<"ivybridge">; +def : IvyBridgeProc<"core-avx-i">; // Legacy alias. + +class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureAVX2, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec + ]>; +def : HaswellProc<"haswell">; +def : HaswellProc<"core-avx2">; // Legacy alias. 
+ +class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureAVX2, + FeatureCMPXCHG16B, + FeatureFastUAMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSMAP, + FeatureSlowIncDec + ]>; +def : BroadwellProc<"broadwell">; + // FIXME: define KNL model -def : ProcessorModel<"knl", HaswellModel, +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, FeatureSlowIncDec]>; +def : KnightsLandingProc<"knl">; -// SKX // FIXME: define SKX model -def : ProcessorModel<"skx", HaswellModel, +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [FeatureAVX512, FeatureCDI, FeatureDQI, FeatureBWI, FeatureVLX, FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec]>; + FeatureSlowIncDec, FeatureSGX]>; +def : SkylakeProc<"skylake">; +def : SkylakeProc<"skx">; // Legacy alias. + + +// AMD CPUs. 
def : Proc<"k6", [FeatureMMX]>; def : Proc<"k6-2", [Feature3DNow]>; @@ -302,7 +409,7 @@ def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, FeatureSlowSHLD]>; @@ -326,39 +433,52 @@ def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"barcelona", [FeatureSSE4A, + Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowBTMem, + FeatureSlowSHLD]>; // Bobcat def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + // Jaguar -def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureAES, FeaturePCLMUL, - FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; +def : ProcessorModel<"btver2", BtVer2Model, + [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, + FeaturePRFCHW, FeatureAES, FeaturePCLMUL, + FeatureBMI, FeatureF16C, FeatureMOVBE, + FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, + FeatureSlowSHLD, FeatureUseSqrtEst, FeatureUseRecipEst]>; + +// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. 
+ // Bulldozer def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>; + FeatureAVX, FeatureSSE4A, FeatureLZCNT, + FeaturePOPCNT, FeatureSlowSHLD]>; // Piledriver def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureTBM, - FeatureFMA, FeatureSlowSHLD]>; + FeatureAVX, FeatureSSE4A, FeatureF16C, + FeatureLZCNT, FeaturePOPCNT, FeatureBMI, + FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; // Steamroller def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureTBM, - FeatureFMA, FeatureFSGSBase]>; + FeatureAVX, FeatureSSE4A, FeatureF16C, + FeatureLZCNT, FeaturePOPCNT, FeatureBMI, + FeatureTBM, FeatureFMA, FeatureSlowSHLD, + FeatureFSGSBase]>; // Excavator def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, FeaturePCLMUL, FeatureF16C, FeatureLZCNT, FeaturePOPCNT, FeatureBMI, FeatureBMI2, - FeatureTBM, FeatureFMA, FeatureFSGSBase]>; + FeatureTBM, FeatureFMA, FeatureSSE4A, + FeatureFSGSBase]>; def : Proc<"geode", [Feature3DNowA]>; @@ -371,7 +491,7 @@ def : Proc<"c3-2", [FeatureSSE1]>; // be good for modern chips without enabling instruction set encodings past the // basic SSE2 and 64-bit ones. It disables slow things from any mainstream and // modern 64-bit x86 chip, and enables features that are generally beneficial. -// +// // We currently use the Sandy Bridge model as the default scheduling model as // we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which // covers a huge swath of x86 processors. 
If there are specific scheduling diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index 57c7a62bd5c1..d06e94fbfe1f 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -47,6 +47,8 @@ using namespace llvm; /// runOnMachineFunction - Emit the function body. /// bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + SMShadowTracker.startFunction(MF); + SetupMachineFunction(MF); if (Subtarget->isTargetCOFF()) { @@ -503,7 +505,7 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { - if (Subtarget->isTargetMacho()) + if (Subtarget->isTargetMachO()) OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); if (Subtarget->isTargetCOFF()) { @@ -556,14 +558,16 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + SectionKind Kind = + CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout()); const Constant *C = CPE.Val.ConstVal; - const MCSectionCOFF *S = cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(Kind, C)); - if (MCSymbol *Sym = S->getCOMDATSymbol()) { - if (Sym->isUndefined()) - OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); - return Sym; + if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( + getObjFileLowering().getSectionForConstant(Kind, C))) { + if (MCSymbol *Sym = S->getCOMDATSymbol()) { + if (Sym->isUndefined()) + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + return Sym; + } } } } @@ -599,10 +603,10 @@ void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { } void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMacho()) { + if (Subtarget->isTargetMachO()) { // All darwin targets use 
mach-o. MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MMI->getObjFileInfo<MachineModuleInfoMachO>(); // Output stubs for dynamically-linked functions. MachineModuleInfoMachO::SymbolListTy Stubs; @@ -684,11 +688,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; for (const auto &Function : M) - if (Function.hasDLLExportStorageClass()) + if (Function.hasDLLExportStorageClass() && !Function.isDeclaration()) DLLExportedFns.push_back(getSymbol(&Function)); for (const auto &Global : M.globals()) - if (Global.hasDLLExportStorageClass()) + if (Global.hasDLLExportStorageClass() && !Global.isDeclaration()) DLLExportedGlobals.push_back(getSymbol(&Global)); for (const auto &Alias : M.aliases()) { @@ -725,7 +729,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); if (!Stubs.empty()) { OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); + const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout(); for (const auto &Stub : Stubs) { OutStreamer.EmitLabel(Stub.first); @@ -734,6 +738,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } Stubs.clear(); } + + SM.serializeToStackMapSection(); } } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index b1bbe8e41cc0..748b948d212f 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -7,14 +7,19 @@ // //===----------------------------------------------------------------------===// -#ifndef X86ASMPRINTER_H -#define X86ASMPRINTER_H +#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H +#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H #include "X86Subtarget.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/Target/TargetMachine.h" +// Implemented in X86MCInstLower.cpp +namespace 
{ + class X86MCInstLower; +} + namespace llvm { class MCStreamer; class MCSymbol; @@ -25,9 +30,63 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void GenerateExportDirective(const MCSymbol *Sym, bool IsData); + // This utility class tracks the length of a stackmap instruction's 'shadow'. + // It is used by the X86AsmPrinter to ensure that the stackmap shadow + // invariants (i.e. no other stackmaps, patchpoints, or control flow within + // the shadow) are met, while outputting a minimal number of NOPs for padding. + // + // To minimise the number of NOPs used, the shadow tracker counts the number + // of instruction bytes output since the last stackmap. Only if there are too + // few instruction bytes to cover the shadow are NOPs used for padding. + class StackMapShadowTracker { + public: + StackMapShadowTracker(TargetMachine &TM); + ~StackMapShadowTracker(); + void startFunction(MachineFunction &MF); + void count(MCInst &Inst, const MCSubtargetInfo &STI); + + // Called to signal the start of a shadow of RequiredSize bytes. + void reset(unsigned RequiredSize) { + RequiredShadowSize = RequiredSize; + CurrentShadowSize = 0; + InShadow = true; + } + + // Called before every stackmap/patchpoint, and at the end of basic blocks, + // to emit any necessary padding-NOPs. + void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI); + private: + TargetMachine &TM; + std::unique_ptr<MCCodeEmitter> CodeEmitter; + bool InShadow; + + // RequiredShadowSize holds the length of the shadow specified in the most + // recently encountered STACKMAP instruction. + // CurrentShadowSize counts the number of bytes encoded since the most + // recently encountered STACKMAP, stopping when that number is greater than + // or equal to RequiredShadowSize. + unsigned RequiredShadowSize, CurrentShadowSize; + }; + + StackMapShadowTracker SMShadowTracker; + + // All instructions emitted by the X86AsmPrinter should use this helper + // method. 
+ // + // This helper function invokes the SMShadowTracker on each instruction before + // outputting it to the OutStream. This allows the shadow tracker to minimise + // the number of NOPs used for stackmap padding. + void EmitAndCountInstruction(MCInst &Inst); + + void InsertStackMapShadows(MachineFunction &MF); + void LowerSTACKMAP(const MachineInstr &MI); + void LowerPATCHPOINT(const MachineInstr &MI); + + void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI); + public: explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), SM(*this) { + : AsmPrinter(TM, Streamer), SM(*this), SMShadowTracker(TM) { Subtarget = &TM.getSubtarget<X86Subtarget>(); } @@ -43,6 +102,10 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void EmitInstruction(const MachineInstr *MI) override; + void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + } + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &OS) override; @@ -53,6 +116,12 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { /// \brief Return the symbol for the specified constant pool entry. 
MCSymbol *GetCPISymbol(unsigned CPID) const override; + bool doInitialization(Module &M) override { + SMShadowTracker.reset(0); + SM.reset(); + return AsmPrinter::doInitialization(M); + } + bool runOnMachineFunction(MachineFunction &F) override; }; diff --git a/contrib/llvm/lib/Target/X86/X86AtomicExpandPass.cpp b/contrib/llvm/lib/Target/X86/X86AtomicExpandPass.cpp deleted file mode 100644 index 3dcadb16760b..000000000000 --- a/contrib/llvm/lib/Target/X86/X86AtomicExpandPass.cpp +++ /dev/null @@ -1,283 +0,0 @@ -//===-- X86AtomicExpandPass.cpp - Expand illegal atomic instructions --0---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass (at IR level) to replace atomic instructions which -// cannot be implemented as a single instruction with cmpxchg-based loops. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86TargetMachine.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "x86-atomic-expand" - -namespace { - class X86AtomicExpandPass : public FunctionPass { - const X86TargetMachine *TM; - public: - static char ID; // Pass identification, replacement for typeid - explicit X86AtomicExpandPass(const X86TargetMachine *TM) - : FunctionPass(ID), TM(TM) {} - - bool runOnFunction(Function &F) override; - bool expandAtomicInsts(Function &F); - - bool needsCmpXchgNb(Type *MemType); - - /// There are four kinds of atomic operations. 
Two never need expanding: - /// cmpxchg is what we expand the others *to*, and loads are easily handled - /// by ISelLowering. Atomicrmw and store can need expanding in some - /// circumstances. - bool shouldExpand(Instruction *Inst); - - /// 128-bit atomic stores (64-bit on i686) need to be implemented in terms - /// of trivial cmpxchg16b loops. A simple store isn't necessarily atomic. - bool shouldExpandStore(StoreInst *SI); - - /// Only some atomicrmw instructions need expanding -- some operations - /// (e.g. max) have absolutely no architectural support; some (e.g. or) have - /// limited support but can't return the previous value; some (e.g. add) - /// have complete support in the instruction set. - /// - /// Also, naturally, 128-bit operations always need to be expanded. - bool shouldExpandAtomicRMW(AtomicRMWInst *AI); - - bool expandAtomicRMW(AtomicRMWInst *AI); - bool expandAtomicStore(StoreInst *SI); - }; -} - -char X86AtomicExpandPass::ID = 0; - -FunctionPass *llvm::createX86AtomicExpandPass(const X86TargetMachine *TM) { - return new X86AtomicExpandPass(TM); -} - -bool X86AtomicExpandPass::runOnFunction(Function &F) { - SmallVector<Instruction *, 1> AtomicInsts; - - // Changing control-flow while iterating through it is a bad idea, so gather a - // list of all atomic instructions before we start. 
- for (BasicBlock &BB : F) - for (Instruction &Inst : BB) { - if (isa<AtomicRMWInst>(&Inst) || - (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic())) - AtomicInsts.push_back(&Inst); - } - - bool MadeChange = false; - for (Instruction *Inst : AtomicInsts) { - if (!shouldExpand(Inst)) - continue; - - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - MadeChange |= expandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - MadeChange |= expandAtomicStore(SI); - - assert(MadeChange && "Atomic inst not expanded when it should be?"); - Inst->eraseFromParent(); - } - - return MadeChange; -} - -/// Returns true if the operand type is 1 step up from the native width, and -/// the corresponding cmpxchg8b or cmpxchg16b instruction is available -/// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86AtomicExpandPass::needsCmpXchgNb(llvm::Type *MemType) { - const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); - unsigned OpWidth = MemType->getPrimitiveSizeInBits(); - - if (OpWidth == 64) - return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b - if (OpWidth == 128) - return Subtarget.hasCmpxchg16b(); - - return false; -} - -bool X86AtomicExpandPass::shouldExpandAtomicRMW(AtomicRMWInst *AI) { - const X86Subtarget &Subtarget = TM->getSubtarget<X86Subtarget>(); - unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; - - if (needsCmpXchgNb(AI->getType())) - return true; - - if (AI->getType()->getPrimitiveSizeInBits() > NativeWidth) - return false; - - AtomicRMWInst::BinOp Op = AI->getOperation(); - switch (Op) { - default: - llvm_unreachable("Unknown atomic operation"); - case AtomicRMWInst::Xchg: - case AtomicRMWInst::Add: - case AtomicRMWInst::Sub: - // It's better to use xadd, xsub or xchg for these in all cases. 
- return false; - case AtomicRMWInst::Or: - case AtomicRMWInst::And: - case AtomicRMWInst::Xor: - // If the atomicrmw's result isn't actually used, we can just add a "lock" - // prefix to a normal instruction for these operations. - return !AI->use_empty(); - case AtomicRMWInst::Nand: - case AtomicRMWInst::Max: - case AtomicRMWInst::Min: - case AtomicRMWInst::UMax: - case AtomicRMWInst::UMin: - // These always require a non-trivial set of data operations on x86. We must - // use a cmpxchg loop. - return true; - } -} - -bool X86AtomicExpandPass::shouldExpandStore(StoreInst *SI) { - if (needsCmpXchgNb(SI->getValueOperand()->getType())) - return true; - - return false; -} - -bool X86AtomicExpandPass::shouldExpand(Instruction *Inst) { - if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) - return shouldExpandAtomicRMW(AI); - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) - return shouldExpandStore(SI); - return false; -} - -/// Emit IR to implement the given atomicrmw operation on values in registers, -/// returning the new value. 
-static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, - Value *Loaded, Value *Inc) { - Value *NewVal; - switch (Op) { - case AtomicRMWInst::Xchg: - return Inc; - case AtomicRMWInst::Add: - return Builder.CreateAdd(Loaded, Inc, "new"); - case AtomicRMWInst::Sub: - return Builder.CreateSub(Loaded, Inc, "new"); - case AtomicRMWInst::And: - return Builder.CreateAnd(Loaded, Inc, "new"); - case AtomicRMWInst::Nand: - return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); - case AtomicRMWInst::Or: - return Builder.CreateOr(Loaded, Inc, "new"); - case AtomicRMWInst::Xor: - return Builder.CreateXor(Loaded, Inc, "new"); - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, Inc); - return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); - default: - break; - } - llvm_unreachable("Unknown atomic op"); -} - -bool X86AtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) { - AtomicOrdering Order = - AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); - Value *Addr = AI->getPointerOperand(); - BasicBlock *BB = AI->getParent(); - Function *F = BB->getParent(); - LLVMContext &Ctx = F->getContext(); - - // Given: atomicrmw some_op iN* %addr, iN %incr ordering - // - // The standard expansion we produce is: - // [...] 
- // %init_loaded = load atomic iN* %addr - // br label %loop - // loop: - // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] - // %new = some_op iN %loaded, %incr - // %pair = cmpxchg iN* %addr, iN %loaded, iN %new - // %new_loaded = extractvalue { iN, i1 } %pair, 0 - // %success = extractvalue { iN, i1 } %pair, 1 - // br i1 %success, label %atomicrmw.end, label %loop - // atomicrmw.end: - // [...] - BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); - BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); - - // This grabs the DebugLoc from AI. - IRBuilder<> Builder(AI); - - // The split call above "helpfully" added a branch at the end of BB (to the - // wrong place), but we want a load. It's easiest to just remove - // the branch entirely. - std::prev(BB->end())->eraseFromParent(); - Builder.SetInsertPoint(BB); - LoadInst *InitLoaded = Builder.CreateLoad(Addr); - InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); - Builder.CreateBr(LoopBB); - - // Start the main loop block now that we've taken care of the preliminaries. - Builder.SetInsertPoint(LoopBB); - PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); - Loaded->addIncoming(InitLoaded, BB); - - Value *NewVal = - performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); - - Value *Pair = Builder.CreateAtomicCmpXchg( - Addr, Loaded, NewVal, Order, - AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); - Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); - Loaded->addIncoming(NewLoaded, LoopBB); - - Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); - Builder.CreateCondBr(Success, ExitBB, LoopBB); - - AI->replaceAllUsesWith(NewLoaded); - - return true; -} - -bool X86AtomicExpandPass::expandAtomicStore(StoreInst *SI) { - // An atomic store might need cmpxchg16b (or 8b on x86) to execute. Express - // this in terms of the usual expansion to "atomicrmw xchg". 
- IRBuilder<> Builder(SI); - AtomicOrdering Order = - SI->getOrdering() == Unordered ? Monotonic : SI->getOrdering(); - AtomicRMWInst *AI = - Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), - SI->getValueOperand(), Order); - - // Now we have an appropriate swap instruction, lower it as usual. - if (shouldExpandAtomicRMW(AI)) { - expandAtomicRMW(AI); - AI->eraseFromParent(); - return true; - } - - return AI; -} diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp new file mode 100644 index 000000000000..fae489e77cc0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -0,0 +1,400 @@ +//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pass that optimizes call sequences on x86. +// Currently, it converts movs of function parameters onto the stack into +// pushes. This is beneficial for two main reasons: +// 1) The push instruction encoding is much smaller than an esp-relative mov +// 2) It is possible to push memory arguments directly. So, if the +// the transformation is preformed pre-reg-alloc, it can help relieve +// register pressure. 
+// +//===----------------------------------------------------------------------===// + +#include <algorithm> + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "X86MachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-cf-opt" + +cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt", + cl::desc("Avoid optimizing x86 call frames for size"), + cl::init(false), cl::Hidden); + +namespace { +class X86CallFrameOptimization : public MachineFunctionPass { +public: + X86CallFrameOptimization() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + bool shouldPerformTransformation(MachineFunction &MF); + + bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + + MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, + unsigned Reg); + + const char *getPassName() const override { + return "X86 Optimize Call Frame"; + } + + const TargetInstrInfo *TII; + const TargetFrameLowering *TFL; + const MachineRegisterInfo *MRI; + static char ID; +}; + +char X86CallFrameOptimization::ID = 0; +} + +FunctionPass *llvm::createX86CallFrameOptimization() { + return new X86CallFrameOptimization(); +} + +// This checks whether the transformation is legal and profitable +bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) { + if (NoX86CFOpt.getValue()) + return false; + + // We currently only support call sequences where *all* parameters. + // are passed on the stack. 
+ // No point in running this in 64-bit mode, since some arguments are + // passed in-register in all common calling conventions, so the pattern + // we're looking for will never match. + const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); + if (STI.is64Bit()) + return false; + + // You would expect straight-line code between call-frame setup and + // call-frame destroy. You would be wrong. There are circumstances (e.g. + // CMOV_GR8 expansion of a select that feeds a function call!) where we can + // end up with the setup and the destroy in different basic blocks. + // This is bad, and breaks SP adjustment. + // So, check that all of the frames in the function are closed inside + // the same block, and, for good measure, that there are no nested frames. + int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + for (MachineBasicBlock &BB : MF) { + bool InsideFrameSequence = false; + for (MachineInstr &MI : BB) { + if (MI.getOpcode() == FrameSetupOpcode) { + if (InsideFrameSequence) + return false; + InsideFrameSequence = true; + } + else if (MI.getOpcode() == FrameDestroyOpcode) { + if (!InsideFrameSequence) + return false; + InsideFrameSequence = false; + } + } + + if (InsideFrameSequence) + return false; + } + + // Now that we know the transformation is legal, check if it is + // profitable. + // TODO: Add a heuristic that actually looks at the function, + // and enable this for more cases. + + // This transformation is always a win when we expected to have + // a reserved call frame. Under other circumstances, it may be either + // a win or a loss, and requires a heuristic. + // For now, enable it only for the relatively clear win cases. + bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); + if (CannotReserveFrame) + return true; + + // For now, don't even try to evaluate the profitability when + // not optimizing for size. 
+ AttributeSet FnAttrs = MF.getFunction()->getAttributes(); + bool OptForSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + + if (!OptForSize) + return false; + + // Stack re-alignment can make this unprofitable even in terms of size. + // As mentioned above, a better heuristic is needed. For now, don't do this + // when the required alignment is above 8. (4 would be the safe choice, but + // some experimentation showed 8 is generally good). + if (TFL->getStackAlignment() > 8) + return false; + + return true; +} + +bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getSubtarget().getInstrInfo(); + TFL = MF.getSubtarget().getFrameLowering(); + MRI = &MF.getRegInfo(); + + if (!shouldPerformTransformation(MF)) + return false; + + int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + + bool Changed = false; + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) + if (I->getOpcode() == FrameSetupOpcode) + Changed |= adjustCallSequence(MF, *BB, I); + + return Changed; +} + +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + // Check that this particular call sequence is amenable to the + // transformation. + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); + unsigned StackPtr = RegInfo.getStackRegister(); + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + + // We expect to enter this at the beginning of a call sequence + assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + MachineBasicBlock::iterator FrameSetup = I++; + + + // For globals in PIC mode, we can have some LEAs here. + // Ignore them, they don't bother us. 
+ // TODO: Extend this to something that covers more cases. + while (I->getOpcode() == X86::LEA32r) + ++I; + + // We expect a copy instruction here. + // TODO: The copy instruction is a lowering artifact. + // We should also support a copy-less version, where the stack + // pointer is used directly. + if (!I->isCopy() || !I->getOperand(0).isReg()) + return false; + MachineBasicBlock::iterator SPCopy = I++; + StackPtr = SPCopy->getOperand(0).getReg(); + + // Scan the call setup sequence for the pattern we're looking for. + // We only handle a simple case - a sequence of MOV32mi or MOV32mr + // instructions, that push a sequence of 32-bit values onto the stack, with + // no gaps between them. + SmallVector<MachineInstr*, 4> MovVector(4, nullptr); + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + if (MaxAdjust > 4) + MovVector.resize(MaxAdjust, nullptr); + + do { + int Opcode = I->getOpcode(); + if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) + break; + + // We only want movs of the form: + // movl imm/r32, k(%esp) + // If we run into something else, bail. + // Note that AddrBaseReg may, counter to its name, not be a register, + // but rather a frame index. + // TODO: Support the fi case. This should probably work now that we + // have the infrastructure to track the stack pointer within a call + // sequence. + if (!I->getOperand(X86::AddrBaseReg).isReg() || + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || + !I->getOperand(X86::AddrScaleAmt).isImm() || + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || + !I->getOperand(X86::AddrDisp).isImm()) + return false; + + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); + assert(StackDisp >= 0 && "Negative stack displacement when passing parameters"); + + // We really don't want to consider the unaligned case. 
+ if (StackDisp % 4) + return false; + StackDisp /= 4; + + assert((size_t)StackDisp < MovVector.size() && + "Function call has more parameters than the stack is adjusted for."); + + // If the same stack slot is being filled twice, something's fishy. + if (MovVector[StackDisp] != nullptr) + return false; + MovVector[StackDisp] = I; + + ++I; + } while (I != MBB.end()); + + // We now expect the end of the sequence - a call and a stack adjust. + if (I == MBB.end()) + return false; + + // For PCrel calls, we expect an additional COPY of the basereg. + // If we find one, skip it. + if (I->isCopy()) { + if (I->getOperand(1).getReg() == + MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) + ++I; + else + return false; + } + + if (!I->isCall()) + return false; + MachineBasicBlock::iterator Call = I; + if ((++I)->getOpcode() != FrameDestroyOpcode) + return false; + + // Now, go through the vector, and see that we don't have any gaps, + // but only a series of 32-bit MOVs. + + int64_t ExpectedDist = 0; + auto MMI = MovVector.begin(), MME = MovVector.end(); + for (; MMI != MME; ++MMI, ExpectedDist += 4) + if (*MMI == nullptr) + break; + + // If the call had no parameters, do nothing + if (!ExpectedDist) + return false; + + // We are either at the last parameter, or a gap. + // Make sure it's not a gap + for (; MMI != MME; ++MMI) + if (*MMI != nullptr) + return false; + + // Ok, we can in fact do the transformation for this call. + // Do not remove the FrameSetup instruction, but adjust the parameters. + // PEI will end up finalizing the handling of this. + FrameSetup->getOperand(1).setImm(ExpectedDist); + + DebugLoc DL = I->getDebugLoc(); + // Now, iterate through the vector in reverse order, and replace the movs + // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // replace uses. 
+ for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) { + MachineBasicBlock::iterator MOV = *MovVector[Idx]; + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + if (MOV->getOpcode() == X86::MOV32mi) { + unsigned PushOpcode = X86::PUSHi32; + // If the operand is a small (8-bit) immediate, we can use a + // PUSH instruction with a shorter encoding. + // Note that isImm() may fail even though this is a MOVmi, because + // the operand can also be a symbol. + if (PushOp.isImm()) { + int64_t Val = PushOp.getImm(); + if (isInt<8>(Val)) + PushOpcode = X86::PUSH32i8; + } + BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + } else { + unsigned int Reg = PushOp.getReg(); + + // If PUSHrmm is not slow on this target, try to fold the source of the + // push into the instruction. + const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); + bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + + // Check that this is legal to fold. Right now, we're extremely + // conservative about that. + MachineInstr *DefMov = nullptr; + if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { + MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm)); + + unsigned NumOps = DefMov->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + Push->addOperand(DefMov->getOperand(i)); + + DefMov->eraseFromParent(); + } else { + BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr(); + } + } + + MBB.erase(MOV); + } + + // The stack-pointer copy is no longer used in the call sequences. + // There should not be any other users, but we can't commit to that, so: + if (MRI->use_empty(SPCopy->getOperand(0).getReg())) + SPCopy->eraseFromParent(); + + // Once we've done this, we need to make sure PEI doesn't assume a reserved + // frame. 
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + FuncInfo->setHasPushSequences(true); + + return true; +} + +MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( + MachineBasicBlock::iterator FrameSetup, unsigned Reg) { + // Do an extremely restricted form of load folding. + // ISel will often create patterns like: + // movl 4(%edi), %eax + // movl 8(%edi), %ecx + // movl 12(%edi), %edx + // movl %edx, 8(%esp) + // movl %ecx, 4(%esp) + // movl %eax, (%esp) + // call + // Get rid of those with prejudice. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + // Make sure this is the only use of Reg. + if (!MRI->hasOneNonDBGUse(Reg)) + return nullptr; + + MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); + + // Make sure the def is a MOV from memory. + // If the def is an another block, give up. + if (DefMI->getOpcode() != X86::MOV32rm || + DefMI->getParent() != FrameSetup->getParent()) + return nullptr; + + // Be careful with movs that load from a stack slot, since it may get + // resolved incorrectly. + // TODO: Again, we already have the infrastructure, so this should work. + if (!DefMI->getOperand(1).isReg()) + return nullptr; + + // Now, make sure everything else up until the ADJCALLSTACK is a sequence + // of MOVs. To be less conservative would require duplicating a lot of the + // logic from PeepholeOptimizer. + // FIXME: A possibly better approach would be to teach the PeepholeOptimizer + // to be smarter about folding into pushes. 
+ for (auto I = DefMI; I != FrameSetup; ++I) + if (I->getOpcode() != X86::MOV32rm) + return nullptr; + + return DefMI; +} diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index e76f9fda2db9..0eb2494f1d63 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -12,14 +12,27 @@ // //===----------------------------------------------------------------------===// -#ifndef X86CALLINGCONV_H -#define X86CALLINGCONV_H +#ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" namespace llvm { +inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // Similar to CCPassIndirect, with the addition of inreg. + LocVT = MVT::i32; + LocInfo = CCValAssign::Indirect; + ArgFlags.setInReg(); + return false; // Continue the search, but now for i32. +} + + inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, CCValAssign::LocInfo &, ISD::ArgFlagsTy &, CCState &) { diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 86c01bd64647..75a2ec004685 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -14,7 +14,9 @@ /// CCIfSubtarget - Match if the current subtarget has a feature F. class CCIfSubtarget<string F, CCAction A> - : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>; + : CCIf<!strconcat("static_cast<const X86Subtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; //===----------------------------------------------------------------------===// // Return Value Calling Conventions @@ -59,20 +61,20 @@ def RetCC_X86Common : CallingConv<[ // MM0, it doesn't support these vector types. 
CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, - // Long double types are always returned in ST0 (even with SSE). - CCIfType<[f80], CCAssignToReg<[ST0, ST1]>> + // Long double types are always returned in FP0 (even with SSE). + CCIfType<[f80], CCAssignToReg<[FP0, FP1]>> ]>; // X86-32 C return-value convention. def RetCC_X86_32_C : CallingConv<[ - // The X86-32 calling convention returns FP values in ST0, unless marked + // The X86-32 calling convention returns FP values in FP0, unless marked // with "inreg" (used here to distinguish one kind of reg from another, // weirdly; this is really the sse-regparm calling convention) in which // case they use XMM0, otherwise it is the same as the common X86 calling // conv. CCIfInReg<CCIfSubtarget<"hasSSE2()", CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, - CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>, + CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, CCDelegateTo<RetCC_X86Common> ]>; @@ -122,6 +124,24 @@ def RetCC_X86_32_HiPE : CallingConv<[ CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> ]>; +// X86-32 HiPE return-value convention. +def RetCC_X86_32_VectorCall : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // Return integers in the standard way. + CCDelegateTo<RetCC_X86Common> +]>; + // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. @@ -177,6 +197,7 @@ def RetCC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, // If HiPE, use RetCC_X86_32_HiPE. 
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, // Otherwise, use RetCC_X86_32_C. CCDelegateTo<RetCC_X86_32_C> @@ -224,6 +245,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[i8, i16], CCPromoteToType<i32>>, // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, CCIfNest<CCAssignToReg<[R10]>>, // The first 6 integer arguments are passed in integer registers. @@ -327,6 +349,25 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[f80], CCAssignToStack<0, 0>> ]>; +def CC_X86_Win64_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_Win64_C> +]>; + + def CC_X86_64_GHC : CallingConv<[ // Promote i8/i16/i32 arguments to i64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, @@ -460,6 +501,30 @@ def CC_X86_32_FastCall : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_32_VectorCall : CallingConv<[ + // The first 6 floating point and vector types of 128 bits or less use + // XMM0-XMM5. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>, + + // 256-bit vectors use YMM registers. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>, + + // 512-bit vectors use ZMM registers. 
+ CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>, + + // Otherwise, pass it indirectly. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, + v32i8, v16i16, v8i32, v4i64, v8f32, v4f64, + v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCCustom<"CC_X86_32_VectorCallIndirect">>, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_32_FastCall> +]>; + def CC_X86_32_ThisCall_Common : CallingConv<[ // The first integer argument is passed in ECX CCIfType<[i32], CCAssignToReg<[ECX]>>, @@ -573,6 +638,7 @@ def CC_Intel_OCL_BI : CallingConv<[ // This is the root argument convention for the X86-32 backend. def CC_X86_32 : CallingConv<[ CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, @@ -590,6 +656,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, diff --git a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp b/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp deleted file mode 100644 index a3ae7ee315a3..000000000000 --- a/contrib/llvm/lib/Target/X86/X86CodeEmitter.cpp +++ /dev/null @@ -1,1498 +0,0 @@ -//===-- X86CodeEmitter.cpp - Convert X86 code to machine code -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file contains the pass that transforms the X86 machine instructions into -// relocatable machine code. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86InstrInfo.h" -#include "X86JITInfo.h" -#include "X86Relocations.h" -#include "X86Subtarget.h" -#include "X86TargetMachine.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/JITCodeEmitter.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/PassManager.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" -using namespace llvm; - -#define DEBUG_TYPE "x86-emitter" - -STATISTIC(NumEmitted, "Number of machine instructions emitted"); - -namespace { - template<class CodeEmitter> - class Emitter : public MachineFunctionPass { - const X86InstrInfo *II; - const DataLayout *TD; - X86TargetMachine &TM; - CodeEmitter &MCE; - MachineModuleInfo *MMI; - intptr_t PICBaseOffset; - bool Is64BitMode; - bool IsPIC; - public: - static char ID; - explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce) - : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm), - MCE(mce), PICBaseOffset(0), Is64BitMode(false), - IsPIC(TM.getRelocationModel() == Reloc::PIC_) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "X86 Machine Code Emitter"; - } - - void emitOpcodePrefix(uint64_t TSFlags, int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const; - - void emitVEXOpcodePrefix(uint64_t TSFlags, int MemOperand, - const 
MachineInstr &MI, - const MCInstrDesc *Desc) const; - - void emitSegmentOverridePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI) const; - - void emitInstruction(MachineInstr &MI, const MCInstrDesc *Desc); - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - AU.addRequired<MachineModuleInfo>(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - private: - void emitPCRelativeBlockAddress(MachineBasicBlock *MBB); - void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc, - intptr_t Disp = 0, intptr_t PCAdj = 0, - bool Indirect = false); - void emitExternalSymbolAddress(const char *ES, unsigned Reloc); - void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0, - intptr_t PCAdj = 0); - void emitJumpTableAddress(unsigned JTI, unsigned Reloc, - intptr_t PCAdj = 0); - - void emitDisplacementField(const MachineOperand *RelocOp, int DispVal, - intptr_t Adj = 0, bool IsPCRel = true); - - void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField); - void emitRegModRMByte(unsigned RegOpcodeField); - void emitSIBByte(unsigned SS, unsigned Index, unsigned Base); - void emitConstant(uint64_t Val, unsigned Size); - - void emitMemModRMByte(const MachineInstr &MI, - unsigned Op, unsigned RegOpcodeField, - intptr_t PCAdj = 0); - - unsigned getX86RegNum(unsigned RegNo) const { - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - return TRI->getEncodingValue(RegNo) & 0x7; - } - - unsigned char getVEXRegisterEncoding(const MachineInstr &MI, - unsigned OpNum) const; - }; - -template<class CodeEmitter> - char Emitter<CodeEmitter>::ID = 0; -} // end anonymous namespace. - -/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code -/// to the specified JITCodeEmitter object. 
-FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM, - JITCodeEmitter &JCE) { - return new Emitter<JITCodeEmitter>(TM, JCE); -} - -template<class CodeEmitter> -bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) { - MMI = &getAnalysis<MachineModuleInfo>(); - MCE.setModuleInfo(MMI); - - II = TM.getInstrInfo(); - TD = TM.getDataLayout(); - Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit(); - IsPIC = TM.getRelocationModel() == Reloc::PIC_; - - do { - DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n"); - MCE.startFunction(MF); - for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); - MBB != E; ++MBB) { - MCE.StartMachineBasicBlock(MBB); - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); - I != E; ++I) { - const MCInstrDesc &Desc = I->getDesc(); - emitInstruction(*I, &Desc); - // MOVPC32r is basically a call plus a pop instruction. - if (Desc.getOpcode() == X86::MOVPC32r) - emitInstruction(*I, &II->get(X86::POP32r)); - ++NumEmitted; // Keep track of the # of mi's emitted - } - } - } while (MCE.finishFunction(MF)); - - return false; -} - -/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64 -/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand -/// size, and 3) use of X86-64 extended registers. -static unsigned determineREX(const MachineInstr &MI) { - unsigned REX = 0; - const MCInstrDesc &Desc = MI.getDesc(); - - // Pseudo instructions do not need REX prefix byte. - if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo) - return 0; - if (Desc.TSFlags & X86II::REX_W) - REX |= 1 << 3; - - unsigned NumOps = Desc.getNumOperands(); - if (NumOps) { - bool isTwoAddr = NumOps > 1 && - Desc.getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. - unsigned i = isTwoAddr ? 
1 : 0; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - if (X86II::isX86_64NonExtLowByteReg(Reg)) - REX |= 0x40; - } - } - - switch (Desc.TSFlags & X86II::FormMask) { - case X86II::MRMSrcReg: { - if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 2; - i = isTwoAddr ? 2 : 1; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << 0; - } - break; - } - case X86II::MRMSrcMem: { - if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 2; - unsigned Bit = 0; - i = isTwoAddr ? 2 : 1; - for (; i != NumOps; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << Bit; - Bit++; - } - } - break; - } - case X86II::MRMXm: - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: - case X86II::MRMDestMem: { - unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands); - i = isTwoAddr ? 1 : 0; - if (NumOps > e && X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e))) - REX |= 1 << 2; - unsigned Bit = 0; - for (; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (MO.isReg()) { - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << Bit; - Bit++; - } - } - break; - } - default: { - if (X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0))) - REX |= 1 << 0; - i = isTwoAddr ? 2 : 1; - for (unsigned e = NumOps; i != e; ++i) { - const MachineOperand& MO = MI.getOperand(i); - if (X86InstrInfo::isX86_64ExtendedReg(MO)) - REX |= 1 << 2; - } - break; - } - } - } - return REX; -} - - -/// emitPCRelativeBlockAddress - This method keeps track of the information -/// necessary to resolve the address of this block later and emits a dummy -/// value. 
-/// -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) { - // Remember where this reference was and where it is to so we can - // deal with it later. - MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(), - X86::reloc_pcrel_word, MBB)); - MCE.emitWordLE(0); -} - -/// emitGlobalAddress - Emit the specified address to the code stream assuming -/// this is part of a "take the address of a global" instruction. -/// -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitGlobalAddress(const GlobalValue *GV, - unsigned Reloc, - intptr_t Disp /* = 0 */, - intptr_t PCAdj /* = 0 */, - bool Indirect /* = false */) { - intptr_t RelocCST = Disp; - if (Reloc == X86::reloc_picrel_word) - RelocCST = PICBaseOffset; - else if (Reloc == X86::reloc_pcrel_word) - RelocCST = PCAdj; - MachineRelocation MR = Indirect - ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc, - const_cast<GlobalValue *>(GV), - RelocCST, false) - : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc, - const_cast<GlobalValue *>(GV), RelocCST, false); - MCE.addRelocation(MR); - // The relocated value will be added to the displacement - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(Disp); - else - MCE.emitWordLE((int32_t)Disp); -} - -/// emitExternalSymbolAddress - Arrange for the address of an external symbol to -/// be emitted to the current location in the function, and allow it to be PC -/// relative. -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES, - unsigned Reloc) { - intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0; - - // X86 never needs stubs because instruction selection will always pick - // an instruction sequence that is large enough to hold any address - // to a symbol. 
- // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall) - bool NeedStub = false; - MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(), - Reloc, ES, RelocCST, - 0, NeedStub)); - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(0); - else - MCE.emitWordLE(0); -} - -/// emitConstPoolAddress - Arrange for the address of an constant pool -/// to be emitted to the current location in the function, and allow it to be PC -/// relative. -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc, - intptr_t Disp /* = 0 */, - intptr_t PCAdj /* = 0 */) { - intptr_t RelocCST = 0; - if (Reloc == X86::reloc_picrel_word) - RelocCST = PICBaseOffset; - else if (Reloc == X86::reloc_pcrel_word) - RelocCST = PCAdj; - MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(), - Reloc, CPI, RelocCST)); - // The relocated value will be added to the displacement - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(Disp); - else - MCE.emitWordLE((int32_t)Disp); -} - -/// emitJumpTableAddress - Arrange for the address of a jump table to -/// be emitted to the current location in the function, and allow it to be PC -/// relative. 
-template<class CodeEmitter> -void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc, - intptr_t PCAdj /* = 0 */) { - intptr_t RelocCST = 0; - if (Reloc == X86::reloc_picrel_word) - RelocCST = PICBaseOffset; - else if (Reloc == X86::reloc_pcrel_word) - RelocCST = PCAdj; - MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(), - Reloc, JTI, RelocCST)); - // The relocated value will be added to the displacement - if (Reloc == X86::reloc_absolute_dword) - MCE.emitDWordLE(0); - else - MCE.emitWordLE(0); -} - -inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode, - unsigned RM) { - assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); - return RM | (RegOpcode << 3) | (Mod << 6); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg, - unsigned RegOpcodeFld){ - MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg))); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) { - MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0)); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, - unsigned Index, - unsigned Base) { - // SIB byte is in the same format as the ModRMByte... - MCE.emitByte(ModRMByte(SS, Index, Base)); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) { - // Output the constant in little endian byte order... - for (unsigned i = 0; i != Size; ++i) { - MCE.emitByte(Val & 255); - Val >>= 8; - } -} - -/// isDisp8 - Return true if this signed displacement fits in a 8-bit -/// sign-extended field. -static bool isDisp8(int Value) { - return Value == (signed char)Value; -} - -static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp, - const TargetMachine &TM) { - // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer - // mechanism as 32-bit mode. 
- if (TM.getSubtarget<X86Subtarget>().is64Bit() && - !TM.getSubtarget<X86Subtarget>().isTargetDarwin()) - return false; - - // Return true if this is a reference to a stub containing the address of the - // global, not the global itself. - return isGlobalStubReference(GVOp.getTargetFlags()); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp, - int DispVal, - intptr_t Adj /* = 0 */, - bool IsPCRel /* = true */) { - // If this is a simple integer displacement that doesn't require a relocation, - // emit it now. - if (!RelocOp) { - emitConstant(DispVal, 4); - return; - } - - // Otherwise, this is something that requires a relocation. Emit it as such - // now. - unsigned RelocType = Is64BitMode ? - (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext) - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (RelocOp->isGlobal()) { - // In 64-bit static small code model, we could potentially emit absolute. - // But it's probably not beneficial. If the MCE supports using RIP directly - // do it, otherwise fallback to absolute (this is determined by IsPCRel). 
- // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative - // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute - bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM); - emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(), - Adj, Indirect); - } else if (RelocOp->isSymbol()) { - emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType); - } else if (RelocOp->isCPI()) { - emitConstPoolAddress(RelocOp->getIndex(), RelocType, - RelocOp->getOffset(), Adj); - } else { - assert(RelocOp->isJTI() && "Unexpected machine operand!"); - emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj); - } -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI, - unsigned Op,unsigned RegOpcodeField, - intptr_t PCAdj) { - const MachineOperand &Op3 = MI.getOperand(Op+3); - int DispVal = 0; - const MachineOperand *DispForReloc = nullptr; - - // Figure out what sort of displacement we have to handle here. - if (Op3.isGlobal()) { - DispForReloc = &Op3; - } else if (Op3.isSymbol()) { - DispForReloc = &Op3; - } else if (Op3.isCPI()) { - if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) { - DispForReloc = &Op3; - } else { - DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex()); - DispVal += Op3.getOffset(); - } - } else if (Op3.isJTI()) { - if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) { - DispForReloc = &Op3; - } else { - DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex()); - } - } else { - DispVal = Op3.getImm(); - } - - const MachineOperand &Base = MI.getOperand(Op); - const MachineOperand &Scale = MI.getOperand(Op+1); - const MachineOperand &IndexReg = MI.getOperand(Op+2); - - unsigned BaseReg = Base.getReg(); - - // Handle %rip relative addressing. 
- if (BaseReg == X86::RIP || - (Is64BitMode && DispForReloc)) { // [disp32+RIP] in X86-64 mode - assert(IndexReg.getReg() == 0 && Is64BitMode && - "Invalid rip-relative address"); - MCE.emitByte(ModRMByte(0, RegOpcodeField, 5)); - emitDisplacementField(DispForReloc, DispVal, PCAdj, true); - return; - } - - // Indicate that the displacement will use an pcrel or absolute reference - // by default. MCEs able to resolve addresses on-the-fly use pcrel by default - // while others, unless explicit asked to use RIP, use absolute references. - bool IsPCRel = MCE.earlyResolveAddresses() ? true : false; - - // Is a SIB byte needed? - // If no BaseReg, issue a RIP relative instruction only if the MCE can - // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table - // 2-7) and absolute references. - unsigned BaseRegNo = -1U; - if (BaseReg != 0 && BaseReg != X86::RIP) - BaseRegNo = getX86RegNum(BaseReg); - - if (// The SIB byte must be used if there is an index register. - IndexReg.getReg() == 0 && - // The SIB byte must be used if the base is ESP/RSP/R12, all of which - // encode to an R/M value of 4, which indicates that a SIB byte is - // present. - BaseRegNo != N86::ESP && - // If there is no base register and we're in 64-bit mode, we need a SIB - // byte to emit an addr that is just 'disp32' (the non-RIP relative form). - (!Is64BitMode || BaseReg != 0)) { - if (BaseReg == 0 || // [disp32] in X86-32 mode - BaseReg == X86::RIP) { // [disp32+RIP] in X86-64 mode - MCE.emitByte(ModRMByte(0, RegOpcodeField, 5)); - emitDisplacementField(DispForReloc, DispVal, PCAdj, true); - return; - } - - // If the base is not EBP/ESP and there is no displacement, use simple - // indirect register encoding, this handles addresses like [EAX]. The - // encoding for [EBP] with no displacement means [disp32] so we handle it - // by emitting a displacement of 0 below. 
- if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) { - MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo)); - return; - } - - // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. - if (!DispForReloc && isDisp8(DispVal)) { - MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo)); - emitConstant(DispVal, 1); - return; - } - - // Otherwise, emit the most general non-SIB encoding: [REG+disp32] - MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo)); - emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); - return; - } - - // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first. - assert(IndexReg.getReg() != X86::ESP && - IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!"); - - bool ForceDisp32 = false; - bool ForceDisp8 = false; - if (BaseReg == 0) { - // If there is no base register, we emit the special case SIB byte with - // MOD=0, BASE=4, to JUST get the index, scale, and displacement. - MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); - ForceDisp32 = true; - } else if (DispForReloc) { - // Emit the normal disp32 encoding. - MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); - ForceDisp32 = true; - } else if (DispVal == 0 && BaseRegNo != N86::EBP) { - // Emit no displacement ModR/M byte - MCE.emitByte(ModRMByte(0, RegOpcodeField, 4)); - } else if (isDisp8(DispVal)) { - // Emit the disp8 encoding... - MCE.emitByte(ModRMByte(1, RegOpcodeField, 4)); - ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP - } else { - // Emit the normal disp32 encoding... - MCE.emitByte(ModRMByte(2, RegOpcodeField, 4)); - } - - // Calculate what the SS field value should be... - static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 }; - unsigned SS = SSTable[Scale.getImm()]; - - if (BaseReg == 0) { - // Handle the SIB byte for the case where there is no base, see Intel - // Manual 2A, table 2-7. The displacement has already been output. 
- unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg.getReg()); - else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) - IndexRegNo = 4; - emitSIBByte(SS, IndexRegNo, 5); - } else { - unsigned BaseRegNo = getX86RegNum(BaseReg); - unsigned IndexRegNo; - if (IndexReg.getReg()) - IndexRegNo = getX86RegNum(IndexReg.getReg()); - else - IndexRegNo = 4; // For example [ESP+1*<noreg>+4] - emitSIBByte(SS, IndexRegNo, BaseRegNo); - } - - // Do we need to output a displacement? - if (ForceDisp8) { - emitConstant(DispVal, 1); - } else if (DispVal != 0 || ForceDisp32) { - emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel); - } -} - -static const MCInstrDesc *UpdateOp(MachineInstr &MI, const X86InstrInfo *II, - unsigned Opcode) { - const MCInstrDesc *Desc = &II->get(Opcode); - MI.setDesc(*Desc); - return Desc; -} - -/// Is16BitMemOperand - Return true if the specified instruction has -/// a 16-bit memory operand. Op specifies the operand # of the memoperand. -static bool Is16BitMemOperand(const MachineInstr &MI, unsigned Op) { - const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} - -/// Is32BitMemOperand - Return true if the specified instruction has -/// a 32-bit memory operand. Op specifies the operand # of the memoperand. 
-static bool Is32BitMemOperand(const MachineInstr &MI, unsigned Op) { - const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} - -/// Is64BitMemOperand - Return true if the specified instruction has -/// a 64-bit memory operand. Op specifies the operand # of the memoperand. -#ifndef NDEBUG -static bool Is64BitMemOperand(const MachineInstr &MI, unsigned Op) { - const MachineOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg); - const MachineOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg); - - if ((BaseReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) || - (IndexReg.getReg() != 0 && - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg.getReg()))) - return true; - return false; -} -#endif - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const { - // Emit the operand size opcode prefix as needed. - if (((TSFlags & X86II::OpSizeMask) >> X86II::OpSizeShift) == X86II::OpSize16) - MCE.emitByte(0x66); - - switch (Desc->TSFlags & X86II::OpPrefixMask) { - case X86II::PD: // 66 - MCE.emitByte(0x66); - break; - case X86II::XS: // F3 - MCE.emitByte(0xF3); - break; - case X86II::XD: // F2 - MCE.emitByte(0xF2); - break; - } - - // Handle REX prefix. - if (Is64BitMode) { - if (unsigned REX = determineREX(MI)) - MCE.emitByte(0x40 | REX); - } - - // 0x0F escape code must be emitted just before the opcode. 
- switch (Desc->TSFlags & X86II::OpMapMask) { - case X86II::TB: // Two-byte opcode map - case X86II::T8: // 0F 38 - case X86II::TA: // 0F 3A - MCE.emitByte(0x0F); - break; - } - - switch (Desc->TSFlags & X86II::OpMapMask) { - case X86II::T8: // 0F 38 - MCE.emitByte(0x38); - break; - case X86II::TA: // 0F 3A - MCE.emitByte(0x3A); - break; - } -} - -// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range -// 0-7 and the difference between the 2 groups is given by the REX prefix. -// In the VEX prefix, registers are seen sequencially from 0-15 and encoded -// in 1's complement form, example: -// -// ModRM field => XMM9 => 1 -// VEX.VVVV => XMM9 => ~9 -// -// See table 4-35 of Intel AVX Programming Reference for details. -template<class CodeEmitter> -unsigned char -Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI, - unsigned OpNum) const { - unsigned SrcReg = MI.getOperand(OpNum).getReg(); - unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg()); - if (X86II::isX86_64ExtendedReg(SrcReg)) - SrcRegNum |= 8; - - // The registers represented through VEX_VVVV should - // be encoded in 1's complement form. - return (~SrcRegNum) & 0xf; -} - -/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitSegmentOverridePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI) const { - if (MemOperand < 0) - return; // No memory operand - - // Check for explicit segment override on memory operand. 
- switch (MI.getOperand(MemOperand+X86::AddrSegmentReg).getReg()) { - default: llvm_unreachable("Unknown segment register!"); - case 0: break; - case X86::CS: MCE.emitByte(0x2E); break; - case X86::SS: MCE.emitByte(0x36); break; - case X86::DS: MCE.emitByte(0x3E); break; - case X86::ES: MCE.emitByte(0x26); break; - case X86::FS: MCE.emitByte(0x64); break; - case X86::GS: MCE.emitByte(0x65); break; - } -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags, - int MemOperand, - const MachineInstr &MI, - const MCInstrDesc *Desc) const { - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - - // VEX_R: opcode externsion equivalent to REX.R in - // 1's complement (inverted) form - // - // 1: Same as REX_R=0 (must be 1 in 32-bit mode) - // 0: Same as REX_R=1 (64 bit mode only) - // - unsigned char VEX_R = 0x1; - - // VEX_X: equivalent to REX.X, only used when a - // register is used for index in SIB Byte. 
- // - // 1: Same as REX.X=0 (must be 1 in 32-bit mode) - // 0: Same as REX.X=1 (64-bit mode only) - unsigned char VEX_X = 0x1; - - // VEX_B: - // - // 1: Same as REX_B=0 (ignored in 32-bit mode) - // 0: Same as REX_B=1 (64 bit mode only) - // - unsigned char VEX_B = 0x1; - - // VEX_W: opcode specific (use like REX.W, or used for - // opcode extension, or ignored, depending on the opcode byte) - unsigned char VEX_W = 0; - - // VEX_5M (VEX m-mmmmm field): - // - // 0b00000: Reserved for future use - // 0b00001: implied 0F leading opcode - // 0b00010: implied 0F 38 leading opcode bytes - // 0b00011: implied 0F 3A leading opcode bytes - // 0b00100-0b11111: Reserved for future use - // 0b01000: XOP map select - 08h instructions with imm byte - // 0b01001: XOP map select - 09h instructions with no imm byte - // 0b01010: XOP map select - 0Ah instructions with imm dword - unsigned char VEX_5M = 0; - - // VEX_4V (VEX vvvv field): a register specifier - // (in 1's complement form) or 1111 if unused. 
- unsigned char VEX_4V = 0xf; - - // VEX_L (Vector Length): - // - // 0: scalar or 128-bit vector - // 1: 256-bit vector - // - unsigned char VEX_L = 0; - - // VEX_PP: opcode extension providing equivalent - // functionality of a SIMD prefix - // - // 0b00: None - // 0b01: 66 - // 0b10: F3 - // 0b11: F2 - // - unsigned char VEX_PP = 0; - - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) - VEX_W = 1; - - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) - VEX_L = 1; - - switch (TSFlags & X86II::OpPrefixMask) { - default: break; // VEX_PP already correct - case X86II::PD: VEX_PP = 0x1; break; // 66 - case X86II::XS: VEX_PP = 0x2; break; // F3 - case X86II::XD: VEX_PP = 0x3; break; // F2 - } - - switch (TSFlags & X86II::OpMapMask) { - default: llvm_unreachable("Invalid prefix!"); - case X86II::TB: VEX_5M = 0x1; break; // 0F - case X86II::T8: VEX_5M = 0x2; break; // 0F 38 - case X86II::TA: VEX_5M = 0x3; break; // 0F 3A - case X86II::XOP8: VEX_5M = 0x8; break; - case X86II::XOP9: VEX_5M = 0x9; break; - case X86II::XOPA: VEX_5M = 0xA; break; - } - - // Classify VEX_B, VEX_4V, VEX_R, VEX_X - unsigned NumOps = Desc->getNumOperands(); - unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) - ++CurOp; - else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); - // Special case for GATHER with 2 TIED_TO operands - // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - } - - switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!"); - case X86II::RawFrm: - break; - case X86II::MRMDestMem: { - // MRMDestMem instructions forms: - // MemAddr, src1(ModR/M) - // MemAddr, src1(VEX_4V), src2(ModR/M) - // MemAddr, src1(ModR/M), imm8 - // - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg())) - VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg())) - 
VEX_X = 0x0; - - CurOp = X86::AddrNumOperands; - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - const MachineOperand &MO = MI.getOperand(CurOp); - if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg())) - VEX_R = 0x0; - break; - } - case X86II::MRMSrcMem: - // MRMSrcMem instructions forms: - // src1(ModR/M), MemAddr - // src1(ModR/M), src2(VEX_4V), MemAddr - // src1(ModR/M), MemAddr, imm8 - // src1(ModR/M), MemAddr, src2(VEX_I8IMM) - // - // FMA4: - // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) - // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_R = 0x0; - CurOp++; - - if (HasVEX_4V) { - VEX_4V = getVEXRegisterEncoding(MI, CurOp); - CurOp++; - } - - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) - VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) - VEX_X = 0x0; - - if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands); - break; - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - // MRM[0-9]m instructions forms: - // MemAddr - // src1(VEX_4V), MemAddr - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrBaseReg).getReg())) - VEX_B = 0x0; - if (X86II::isX86_64ExtendedReg( - MI.getOperand(MemOperand+X86::AddrIndexReg).getReg())) - VEX_X = 0x0; - break; - } - case X86II::MRMSrcReg: - // MRMSrcReg instructions forms: - // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) - // dst(ModR/M), src1(ModR/M) - // dst(ModR/M), src1(ModR/M), imm8 - // - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_R = 0x0; - CurOp++; - - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - if (HasMemOp4) // Skip 
second register source (encoded in I8IMM) - CurOp++; - - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_B = 0x0; - CurOp++; - if (HasVEX_4VOp3) - VEX_4V = getVEXRegisterEncoding(MI, CurOp); - break; - case X86II::MRMDestReg: - // MRMDestReg instructions forms: - // dst(ModR/M), src(ModR/M) - // dst(ModR/M), src(ModR/M), imm8 - // dst(ModR/M), src1(VEX_4V), src2(ModR/M) - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_B = 0x0; - CurOp++; - - if (HasVEX_4V) - VEX_4V = getVEXRegisterEncoding(MI, CurOp++); - - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_R = 0x0; - break; - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: - // MRM0r-MRM7r instructions forms: - // dst(VEX_4V), src(ModR/M), imm8 - VEX_4V = getVEXRegisterEncoding(MI, CurOp); - CurOp++; - - if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg())) - VEX_B = 0x0; - break; - } - - // Emit segment override opcode prefix as needed. - emitSegmentOverridePrefix(TSFlags, MemOperand, MI); - - // VEX opcode prefix can have 2 or 3 bytes - // - // 3 bytes: - // +-----+ +--------------+ +-------------------+ - // | C4h | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - // 2 bytes: - // +-----+ +-------------------+ - // | C5h | | R | vvvv | L | pp | - // +-----+ +-------------------+ - // - // XOP uses a similar prefix: - // +-----+ +--------------+ +-------------------+ - // | 8Fh | | RXB | m-mmmm | | W | vvvv | L | pp | - // +-----+ +--------------+ +-------------------+ - unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3); - - // Can this use the 2 byte VEX prefix? - if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { - MCE.emitByte(0xC5); - MCE.emitByte(LastByte | (VEX_R << 7)); - return; - } - - // 3 byte VEX prefix - MCE.emitByte(Encoding == X86II::XOP ? 
0x8F : 0xC4); - MCE.emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M); - MCE.emitByte(LastByte | (VEX_W << 7)); -} - -template<class CodeEmitter> -void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, - const MCInstrDesc *Desc) { - DEBUG(dbgs() << MI); - - // If this is a pseudo instruction, lower it. - switch (Desc->getOpcode()) { - case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; - case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; - case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; - case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; - case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; - case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; - case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; - case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; - case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; - case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; - case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; - case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; - case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; - case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; - case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; - case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; - case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; - } - - - MCE.processDebugLoc(MI.getDebugLoc(), true); - - unsigned Opcode = Desc->Opcode; - - // If this is a two-address instruction, skip one of the register operands. 
- unsigned NumOps = Desc->getNumOperands(); - unsigned CurOp = 0; - if (NumOps > 1 && Desc->getOperandConstraint(1, MCOI::TIED_TO) == 0) - ++CurOp; - else if (NumOps > 3 && Desc->getOperandConstraint(2, MCOI::TIED_TO) == 0) { - assert(Desc->getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1); - // Special case for GATHER with 2 TIED_TO operands - // Skip the first 2 operands: dst, mask_wb - CurOp += 2; - } - - uint64_t TSFlags = Desc->TSFlags; - - // Encoding type for this instruction. - unsigned char Encoding = (TSFlags & X86II::EncodingMask) >> - X86II::EncodingShift; - - // It uses the VEX.VVVV field? - bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; - bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; - bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4; - const unsigned MemOp4_I8IMMOperand = 2; - - // Determine where the memory operand starts, if present. - int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); - if (MemoryOperand != -1) MemoryOperand += CurOp; - - // Emit the lock opcode prefix as needed. - if (Desc->TSFlags & X86II::LOCK) - MCE.emitByte(0xF0); - - // Emit segment override opcode prefix as needed. - emitSegmentOverridePrefix(TSFlags, MemoryOperand, MI); - - // Emit the repeat opcode prefix as needed. - if (Desc->TSFlags & X86II::REP) - MCE.emitByte(0xF3); - - // Emit the address size opcode prefix as needed. 
- bool need_address_override; - if (TSFlags & X86II::AdSize) { - need_address_override = true; - } else if (MemoryOperand < 0) { - need_address_override = false; - } else if (Is64BitMode) { - assert(!Is16BitMemOperand(MI, MemoryOperand)); - need_address_override = Is32BitMemOperand(MI, MemoryOperand); - } else { - assert(!Is64BitMemOperand(MI, MemoryOperand)); - need_address_override = Is16BitMemOperand(MI, MemoryOperand); - } - - if (need_address_override) - MCE.emitByte(0x67); - - if (Encoding == 0) - emitOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); - else - emitVEXOpcodePrefix(TSFlags, MemoryOperand, MI, Desc); - - unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags); - switch (TSFlags & X86II::FormMask) { - default: - llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!"); - case X86II::Pseudo: - // Remember the current PC offset, this is the PIC relocation - // base address. - switch (Opcode) { - default: - llvm_unreachable("pseudo instructions should be removed before code" - " emission"); - // Do nothing for Int_MemBarrier - it's just a comment. Add a debug - // to make it slightly easier to see. - case X86::Int_MemBarrier: - DEBUG(dbgs() << "#MEMBARRIER\n"); - break; - - case TargetOpcode::INLINEASM: - // We allow inline assembler nodes with empty bodies - they can - // implicitly define registers, which is ok for JIT. 
- if (MI.getOperand(0).getSymbolName()[0]) { - DebugLoc DL = MI.getDebugLoc(); - DL.print(MI.getParent()->getParent()->getFunction()->getContext(), - llvm::errs()); - report_fatal_error("JIT does not support inline asm!"); - } - break; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::CFI_INSTRUCTION: - break; - case TargetOpcode::GC_LABEL: - case TargetOpcode::EH_LABEL: - MCE.emitLabel(MI.getOperand(0).getMCSymbol()); - break; - - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - break; - - case X86::SEH_PushReg: - case X86::SEH_SaveReg: - case X86::SEH_SaveXMM: - case X86::SEH_StackAlloc: - case X86::SEH_SetFrame: - case X86::SEH_PushFrame: - case X86::SEH_EndPrologue: - break; - - case X86::MOVPC32r: { - // This emits the "call" portion of this pseudo instruction. - MCE.emitByte(BaseOpcode); - emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags)); - // Remember PIC base. - PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset(); - X86JITInfo *JTI = TM.getJITInfo(); - JTI->setPICBase(MCE.getCurrentPCValue()); - break; - } - } - CurOp = NumOps; - break; - case X86II::RawFrm: { - MCE.emitByte(BaseOpcode); - - if (CurOp == NumOps) - break; - - const MachineOperand &MO = MI.getOperand(CurOp++); - - DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n"); - DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n"); - DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n"); - DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n"); - DEBUG(dbgs() << "isImm " << MO.isImm() << "\n"); - - if (MO.isMBB()) { - emitPCRelativeBlockAddress(MO.getMBB()); - break; - } - - if (MO.isGlobal()) { - emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word, - MO.getOffset(), 0); - break; - } - - if (MO.isSymbol()) { - emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word); - break; - } - - // FIXME: Only used by hackish MCCodeEmitter, remove when dead. 
- if (MO.isJTI()) { - emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word); - break; - } - - assert(MO.isImm() && "Unknown RawFrm operand!"); - if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) { - // Fix up immediate operand for pc relative calls. - intptr_t Imm = (intptr_t)MO.getImm(); - Imm = Imm - MCE.getCurrentPCValue() - 4; - emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags)); - } else - emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags)); - break; - } - - case X86II::AddRegFrm: { - MCE.emitByte(BaseOpcode + - getX86RegNum(MI.getOperand(CurOp++).getReg())); - - if (CurOp == NumOps) - break; - - const MachineOperand &MO1 = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO1.isImm()) { - emitConstant(MO1.getImm(), Size); - break; - } - - unsigned rt = Is64BitMode ? X86::reloc_pcrel_word - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV32ri64) - rt = X86::reloc_absolute_word; // FIXME: add X86II flag? - // This should not occur on Darwin for relocatable objects. - if (Opcode == X86::MOV64ri) - rt = X86::reloc_absolute_dword; // FIXME: add X86II flag? 
- if (MO1.isGlobal()) { - bool Indirect = gvNeedsNonLazyPtr(MO1, TM); - emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, - Indirect); - } else if (MO1.isSymbol()) - emitExternalSymbolAddress(MO1.getSymbolName(), rt); - else if (MO1.isCPI()) - emitConstPoolAddress(MO1.getIndex(), rt); - else if (MO1.isJTI()) - emitJumpTableAddress(MO1.getIndex(), rt); - break; - } - - case X86II::MRMDestReg: { - MCE.emitByte(BaseOpcode); - - unsigned SrcRegNum = CurOp+1; - if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; - - emitRegModRMByte(MI.getOperand(CurOp).getReg(), - getX86RegNum(MI.getOperand(SrcRegNum).getReg())); - CurOp = SrcRegNum + 1; - break; - } - case X86II::MRMDestMem: { - MCE.emitByte(BaseOpcode); - - unsigned SrcRegNum = CurOp + X86::AddrNumOperands; - if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - SrcRegNum++; - emitMemModRMByte(MI, CurOp, - getX86RegNum(MI.getOperand(SrcRegNum).getReg())); - CurOp = SrcRegNum + 1; - break; - } - - case X86II::MRMSrcReg: { - MCE.emitByte(BaseOpcode); - - unsigned SrcRegNum = CurOp+1; - if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) - ++SrcRegNum; - - if (HasMemOp4) // Skip 2nd src (which is encoded in I8IMM) - ++SrcRegNum; - - emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(), - getX86RegNum(MI.getOperand(CurOp).getReg())); - // 2 operands skipped with HasMemOp4, compensate accordingly - CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1; - if (HasVEX_4VOp3) - ++CurOp; - break; - } - case X86II::MRMSrcMem: { - int AddrOperands = X86::AddrNumOperands; - unsigned FirstMemOp = CurOp+1; - if (HasVEX_4V) { - ++AddrOperands; - ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). - } - if (HasMemOp4) // Skip second register source (encoded in I8IMM) - ++FirstMemOp; - - MCE.emitByte(BaseOpcode); - - intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ? 
- X86II::getSizeOfImm(Desc->TSFlags) : 0; - emitMemModRMByte(MI, FirstMemOp, - getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj); - CurOp += AddrOperands + 1; - if (HasVEX_4VOp3) - ++CurOp; - break; - } - - case X86II::MRMXr: - case X86II::MRM0r: case X86II::MRM1r: - case X86II::MRM2r: case X86II::MRM3r: - case X86II::MRM4r: case X86II::MRM5r: - case X86II::MRM6r: case X86II::MRM7r: { - if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). - ++CurOp; - MCE.emitByte(BaseOpcode); - uint64_t Form = (Desc->TSFlags & X86II::FormMask); - emitRegModRMByte(MI.getOperand(CurOp++).getReg(), - (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r); - - if (CurOp == NumOps) - break; - - const MachineOperand &MO1 = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO1.isImm()) { - emitConstant(MO1.getImm(), Size); - break; - } - - unsigned rt = Is64BitMode ? X86::reloc_pcrel_word - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV64ri32) - rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? - if (MO1.isGlobal()) { - bool Indirect = gvNeedsNonLazyPtr(MO1, TM); - emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0, - Indirect); - } else if (MO1.isSymbol()) - emitExternalSymbolAddress(MO1.getSymbolName(), rt); - else if (MO1.isCPI()) - emitConstPoolAddress(MO1.getIndex(), rt); - else if (MO1.isJTI()) - emitJumpTableAddress(MO1.getIndex(), rt); - break; - } - - case X86II::MRMXm: - case X86II::MRM0m: case X86II::MRM1m: - case X86II::MRM2m: case X86II::MRM3m: - case X86II::MRM4m: case X86II::MRM5m: - case X86II::MRM6m: case X86II::MRM7m: { - if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV). - ++CurOp; - intptr_t PCAdj = (CurOp + X86::AddrNumOperands != NumOps) ? - (MI.getOperand(CurOp+X86::AddrNumOperands).isImm() ? 
- X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0; - - MCE.emitByte(BaseOpcode); - uint64_t Form = (Desc->TSFlags & X86II::FormMask); - emitMemModRMByte(MI, CurOp, (Form==X86II::MRMXm) ? 0 : Form - X86II::MRM0m, - PCAdj); - CurOp += X86::AddrNumOperands; - - if (CurOp == NumOps) - break; - - const MachineOperand &MO = MI.getOperand(CurOp++); - unsigned Size = X86II::getSizeOfImm(Desc->TSFlags); - if (MO.isImm()) { - emitConstant(MO.getImm(), Size); - break; - } - - unsigned rt = Is64BitMode ? X86::reloc_pcrel_word - : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word); - if (Opcode == X86::MOV64mi32) - rt = X86::reloc_absolute_word_sext; // FIXME: add X86II flag? - if (MO.isGlobal()) { - bool Indirect = gvNeedsNonLazyPtr(MO, TM); - emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0, - Indirect); - } else if (MO.isSymbol()) - emitExternalSymbolAddress(MO.getSymbolName(), rt); - else if (MO.isCPI()) - emitConstPoolAddress(MO.getIndex(), rt); - else if (MO.isJTI()) - emitJumpTableAddress(MO.getIndex(), rt); - break; - } - - case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: - case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: - case X86II::MRM_D0: case X86II::MRM_D1: case X86II::MRM_D4: - case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D8: - case X86II::MRM_D9: case X86II::MRM_DA: case X86II::MRM_DB: - case X86II::MRM_DC: case X86II::MRM_DD: case X86II::MRM_DE: - case X86II::MRM_DF: case X86II::MRM_E0: case X86II::MRM_E1: - case X86II::MRM_E2: case X86II::MRM_E3: case X86II::MRM_E4: - case X86II::MRM_E5: case X86II::MRM_E8: case X86II::MRM_E9: - case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: - case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_F0: - case X86II::MRM_F1: case X86II::MRM_F2: case X86II::MRM_F3: - case X86II::MRM_F4: case X86II::MRM_F5: case X86II::MRM_F6: - case X86II::MRM_F7: case X86II::MRM_F8: case X86II::MRM_F9: - case X86II::MRM_FA: 
case X86II::MRM_FB: case X86II::MRM_FC: - case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: - MCE.emitByte(BaseOpcode); - - unsigned char MRM; - switch (TSFlags & X86II::FormMask) { - default: llvm_unreachable("Invalid Form"); - case X86II::MRM_C0: MRM = 0xC0; break; - case X86II::MRM_C1: MRM = 0xC1; break; - case X86II::MRM_C2: MRM = 0xC2; break; - case X86II::MRM_C3: MRM = 0xC3; break; - case X86II::MRM_C4: MRM = 0xC4; break; - case X86II::MRM_C8: MRM = 0xC8; break; - case X86II::MRM_C9: MRM = 0xC9; break; - case X86II::MRM_CA: MRM = 0xCA; break; - case X86II::MRM_CB: MRM = 0xCB; break; - case X86II::MRM_D0: MRM = 0xD0; break; - case X86II::MRM_D1: MRM = 0xD1; break; - case X86II::MRM_D4: MRM = 0xD4; break; - case X86II::MRM_D5: MRM = 0xD5; break; - case X86II::MRM_D6: MRM = 0xD6; break; - case X86II::MRM_D8: MRM = 0xD8; break; - case X86II::MRM_D9: MRM = 0xD9; break; - case X86II::MRM_DA: MRM = 0xDA; break; - case X86II::MRM_DB: MRM = 0xDB; break; - case X86II::MRM_DC: MRM = 0xDC; break; - case X86II::MRM_DD: MRM = 0xDD; break; - case X86II::MRM_DE: MRM = 0xDE; break; - case X86II::MRM_DF: MRM = 0xDF; break; - case X86II::MRM_E0: MRM = 0xE0; break; - case X86II::MRM_E1: MRM = 0xE1; break; - case X86II::MRM_E2: MRM = 0xE2; break; - case X86II::MRM_E3: MRM = 0xE3; break; - case X86II::MRM_E4: MRM = 0xE4; break; - case X86II::MRM_E5: MRM = 0xE5; break; - case X86II::MRM_E8: MRM = 0xE8; break; - case X86II::MRM_E9: MRM = 0xE9; break; - case X86II::MRM_EA: MRM = 0xEA; break; - case X86II::MRM_EB: MRM = 0xEB; break; - case X86II::MRM_EC: MRM = 0xEC; break; - case X86II::MRM_ED: MRM = 0xED; break; - case X86II::MRM_EE: MRM = 0xEE; break; - case X86II::MRM_F0: MRM = 0xF0; break; - case X86II::MRM_F1: MRM = 0xF1; break; - case X86II::MRM_F2: MRM = 0xF2; break; - case X86II::MRM_F3: MRM = 0xF3; break; - case X86II::MRM_F4: MRM = 0xF4; break; - case X86II::MRM_F5: MRM = 0xF5; break; - case X86II::MRM_F6: MRM = 0xF6; break; - case X86II::MRM_F7: MRM = 0xF7; break; 
- case X86II::MRM_F8: MRM = 0xF8; break; - case X86II::MRM_F9: MRM = 0xF9; break; - case X86II::MRM_FA: MRM = 0xFA; break; - case X86II::MRM_FB: MRM = 0xFB; break; - case X86II::MRM_FC: MRM = 0xFC; break; - case X86II::MRM_FD: MRM = 0xFD; break; - case X86II::MRM_FE: MRM = 0xFE; break; - case X86II::MRM_FF: MRM = 0xFF; break; - } - MCE.emitByte(MRM); - break; - } - - while (CurOp != NumOps && NumOps - CurOp <= 2) { - // The last source register of a 4 operand instruction in AVX is encoded - // in bits[7:4] of a immediate byte. - if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { - const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand - : CurOp); - ++CurOp; - unsigned RegNum = getX86RegNum(MO.getReg()) << 4; - if (X86II::isX86_64ExtendedReg(MO.getReg())) - RegNum |= 1 << 7; - // If there is an additional 5th operand it must be an immediate, which - // is encoded in bits[3:0] - if (CurOp != NumOps) { - const MachineOperand &MIMM = MI.getOperand(CurOp++); - if (MIMM.isImm()) { - unsigned Val = MIMM.getImm(); - assert(Val < 16 && "Immediate operand value out of range"); - RegNum |= Val; - } - } - emitConstant(RegNum, 1); - } else { - emitConstant(MI.getOperand(CurOp++).getImm(), - X86II::getSizeOfImm(Desc->TSFlags)); - } - } - - if (!MI.isVariadic() && CurOp != NumOps) { -#ifndef NDEBUG - dbgs() << "Cannot encode all operands of: " << MI << "\n"; -#endif - llvm_unreachable(nullptr); - } - - MCE.processDebugLoc(MI.getDebugLoc(), false); -} diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index ae3eea4943d7..688a5447b8e6 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -64,7 +64,7 @@ public: X86ScalarSSEf32 = Subtarget->hasSSE1(); } - bool TargetSelectInstruction(const Instruction *I) override; + bool fastSelectInstruction(const Instruction *I) override; /// \brief The specified machine instr operand is a vreg, and that /// vreg is 
being provided by the specified load instruction. If possible, @@ -73,9 +73,9 @@ public: bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) override; - bool FastLowerArguments() override; - bool FastLowerCall(CallLoweringInfo &CLI) override; - bool FastLowerIntrinsicCall(const IntrinsicInst *II) override; + bool fastLowerArguments() override; + bool fastLowerCall(CallLoweringInfo &CLI) override; + bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; #include "X86GenFastISel.inc" @@ -127,7 +127,7 @@ private: bool X86SelectFPTrunc(const Instruction *I); const X86InstrInfo *getInstrInfo() const { - return getTargetMachine()->getInstrInfo(); + return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); } const X86TargetMachine *getTargetMachine() const { return static_cast<const X86TargetMachine *>(&TM); @@ -135,11 +135,14 @@ private: bool handleConstantAddresses(const Value *V, X86AddressMode &AM); - unsigned TargetMaterializeConstant(const Constant *C) override; + unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); + unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); + unsigned fastMaterializeConstant(const Constant *C) override; - unsigned TargetMaterializeAlloca(const AllocaInst *C) override; + unsigned fastMaterializeAlloca(const AllocaInst *C) override; - unsigned TargetMaterializeFloatZero(const ConstantFP *CF) override; + unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. @@ -161,46 +164,6 @@ private: } // end anonymous namespace. -static CmpInst::Predicate optimizeCmpPredicate(const CmpInst *CI) { - // If both operands are the same, then try to optimize or fold the cmp. 
- CmpInst::Predicate Predicate = CI->getPredicate(); - if (CI->getOperand(0) != CI->getOperand(1)) - return Predicate; - - switch (Predicate) { - default: llvm_unreachable("Invalid predicate!"); - case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break; - case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break; - case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break; - - case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break; - case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break; - case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break; - } - - return Predicate; -} - static std::pair<X86::CondCode, bool> getX86ConditionCode(CmpInst::Predicate 
Predicate) { X86::CondCode CC = X86::COND_INVALID; @@ -529,7 +492,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg) { - unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, + unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src, /*TODO: Kill=*/false); if (RR == 0) return false; @@ -581,7 +544,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Ok, we need to do a load from a stub. If we've already loaded from // this stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V); + DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); unsigned LoadReg; if (I != LocalValueMap.end() && I->second != 0) { LoadReg = I->second; @@ -692,7 +655,7 @@ redo_gep: case Instruction::Alloca: { // Do static allocas. const AllocaInst *A = cast<AllocaInst>(V); - DenseMap<const AllocaInst*, int>::iterator SI = + DenseMap<const AllocaInst *, int>::iterator SI = FuncInfo.StaticAllocaMap.find(A); if (SI != FuncInfo.StaticAllocaMap.end()) { AM.BaseType = X86AddressMode::FrameIndexBase; @@ -940,7 +903,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { unsigned Alignment = S->getAlignment(); unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); - if (Alignment == 0) // Ensure that codegen never sees alignment 0 + if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = ABIAlignment; bool Aligned = Alignment >= ABIAlignment; @@ -993,8 +956,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ValLocs; - CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); + CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); @@ -1017,7 +979,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // The calling-convention tables for x87 returns don't tell // the whole story. - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; unsigned SrcReg = Reg + VA.getValNo(); @@ -1036,23 +998,23 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; - SrcReg = FastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); + SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; } unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - SrcReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, + SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg, /*TODO: Kill=*/false); } // Make the copy. unsigned DstReg = VA.getLocReg(); - const TargetRegisterClass* SrcRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) return false; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), - DstReg).addReg(SrcReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); // Add register to return instruction. RetRegs.push_back(VA.getLocReg()); @@ -1068,14 +1030,15 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); unsigned RetReg = Subtarget->is64Bit() ? 
X86::RAX : X86::EAX; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), - RetReg).addReg(Reg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); RetRegs.push_back(RetReg); } // Now emit the RET. MachineInstrBuilder MIB = - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) MIB.addReg(RetRegs[i], RegState::Implicit); return true; @@ -1104,7 +1067,7 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1194,7 +1157,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { ResultReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), ResultReg); - ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; @@ -1209,7 +1172,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { } if (ResultReg) { - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1250,7 +1213,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { FlagReg2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), ResultReg).addReg(FlagReg1).addReg(FlagReg2); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1268,7 +1231,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ 
-1285,7 +1248,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); if (SrcVT.SimpleTy == MVT::i1) { // Set the high bits to zero. - ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); + ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; if (ResultReg == 0) @@ -1312,17 +1275,16 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { ResultReg) .addImm(0).addReg(Result32).addImm(X86::sub_32bit); } else if (DstVT != MVT::i8) { - ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, + ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) return false; } - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } - bool X86FastISel::X86SelectBranch(const Instruction *I) { // Unconditional branches are selected by tablegen-generated code. // Handle a conditional branch. @@ -1342,8 +1304,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); switch (Predicate) { default: break; - case CmpInst::FCMP_FALSE: FastEmitBranch(FalseMBB, DbgLoc); return true; - case CmpInst::FCMP_TRUE: FastEmitBranch(TrueMBB, DbgLoc); return true; + case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true; + case CmpInst::FCMP_TRUE: fastEmitBranch(TrueMBB, DbgLoc); return true; } const Value *CmpLHS = CI->getOperand(0); @@ -1400,7 +1362,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // X86 requires a second branch to handle UNE (and OEQ, which is mapped // to UNE above). 
if (NeedExtraBranch) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_4)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1)) .addMBB(TrueMBB); } @@ -1413,7 +1375,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Emits an unconditional branch to the FalseBB, obtains the branch // weight, and adds it to the successor list. - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); return true; } @@ -1437,15 +1399,15 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) .addReg(OpReg).addImm(1); - unsigned JmpOpc = X86::JNE_4; + unsigned JmpOpc = X86::JNE_1; if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) { std::swap(TrueMBB, FalseMBB); - JmpOpc = X86::JE_4; + JmpOpc = X86::JE_1; } BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; if (FuncInfo.BPI) BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), @@ -1465,7 +1427,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; if (FuncInfo.BPI) BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), @@ -1482,9 +1444,9 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(OpReg).addImm(1); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_4)) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); - FastEmitBranch(FalseMBB, DbgLoc); + fastEmitBranch(FalseMBB, DbgLoc); uint32_t BranchWeight = 0; if (FuncInfo.BPI) BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), @@ -1558,7 +1520,7 @@ bool 
X86FastISel::X86SelectShift(const Instruction *I) { unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) .addReg(Op0Reg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1670,8 +1632,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { TII.get(X86::MOV32r0), Zero32); // Copy the zero into the appropriate sub/super/identical physical - // register. Unfortunately the operations needed are not uniform enough to - // fit neatly into the table above. + // register. Unfortunately the operations needed are not uniform enough + // to fit neatly into the table above. if (VT.SimpleTy == MVT::i16) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) @@ -1712,7 +1674,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { ResultSuperReg).addReg(SourceSuperReg).addImm(8); // Now reference the 8-bit subreg of the result. - ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, + ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg, /*Kill=*/true, X86::sub_8bit); } // Copy the result out of the physreg if we haven't already. @@ -1721,7 +1683,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg) .addReg(OpEntry.DivRemResultReg); } - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1779,7 +1741,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { EVT CmpVT = TLI.getValueType(CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags. 
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) - return false; + return false; if (SETFOpc) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); @@ -1837,9 +1799,9 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); - unsigned ResultReg = FastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, + unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1858,7 +1820,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { if (I->getType() != CI->getOperand(0)->getType() || !((Subtarget->hasSSE1() && RetVT == MVT::f32) || - (Subtarget->hasSSE2() && RetVT == MVT::f64) )) + (Subtarget->hasSSE2() && RetVT == MVT::f64))) return false; const Value *CmpLHS = CI->getOperand(0); @@ -1917,15 +1879,15 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned CmpReg = FastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = FastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, + unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); - unsigned AndNReg = FastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, + unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); - unsigned ResultReg = FastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, + unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, AndReg, /*IsKill=*/true); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -1988,8 +1950,8 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { const TargetRegisterClass *RC = 
TLI.getRegClassFor(RetVT); unsigned ResultReg = - FastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); - UpdateValueMap(I, ResultReg); + fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); + updateValueMap(I, ResultReg); return true; } @@ -2018,7 +1980,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(OpReg, getKillRegState(OpIsKill)); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } @@ -2051,7 +2013,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::CVTSS2SDrr), ResultReg) .addReg(OpReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } @@ -2070,7 +2032,7 @@ bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::CVTSD2SSrr), ResultReg) .addReg(OpReg); - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } } @@ -2096,30 +2058,29 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { if (SrcVT == MVT::i8) { // Truncate from i8 to i1; no code needed. - UpdateValueMap(I, InputReg); + updateValueMap(I, InputReg); return true; } if (!Subtarget->is64Bit()) { // If we're on x86-32; we can't extract an i8 from a general register. // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? - (const TargetRegisterClass*)&X86::GR16_ABCDRegClass : - (const TargetRegisterClass*)&X86::GR32_ABCDRegClass; + const TargetRegisterClass *CopyRC = + (SrcVT == MVT::i16) ? 
&X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; unsigned CopyReg = createResultReg(CopyRC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), - CopyReg).addReg(InputReg); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); InputReg = CopyReg; } // Issue an extract_subreg. - unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8, + unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; - UpdateValueMap(I, ResultReg); + updateValueMap(I, ResultReg); return true; } @@ -2145,9 +2106,8 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, VT = MVT::i32; else if (Len >= 2) VT = MVT::i16; - else { + else VT = MVT::i8; - } unsigned Reg; bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); @@ -2163,19 +2123,7 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, return true; } -static bool isCommutativeIntrinsic(IntrinsicInst const *II) { - switch (II->getIntrinsicID()) { - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - return true; - default: - return false; - } -} - -bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { +bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. switch (II->getIntrinsicID()) { default: return false; @@ -2195,14 +2143,14 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; } - // This needs to be set before we call getFrameRegister, otherwise we get - // the wrong frame register. + // This needs to be set before we call getPtrSizedFrameRegister, otherwise + // we get the wrong frame register. 
MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); MFI->setFrameAddressIsTaken(true); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); - unsigned FrameReg = RegInfo->getFrameRegister(*(FuncInfo.MF)); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF)); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -2228,7 +2176,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { SrcReg = DestReg; } - UpdateValueMap(II, SrcReg); + updateValueMap(II, SrcReg); return true; } case Intrinsic::memcpy: { @@ -2258,7 +2206,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) return false; - return LowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); + return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); } case Intrinsic::memset: { const MemSetInst *MSI = cast<MemSetInst>(II); @@ -2273,7 +2221,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { if (MSI->getDestAddressSpace() > 255) return false; - return LowerCallTo(II, "memset", II->getNumArgOperands() - 2); + return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); } case Intrinsic::stackprotector: { // Emit code to store the stack guard onto the stack. @@ -2299,8 +2247,10 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); // FIXME may need to add RegState::Debug to any registers produced, // although ESP/EBP should be the only ones at the moment. - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM). 
- addImm(0).addMetadata(DI->getVariable()); + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) + .addImm(0) + .addMetadata(DI->getVariable()) + .addMetadata(DI->getExpression()); return true; } case Intrinsic::trap: { @@ -2317,7 +2267,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { if (!isTypeLegal(RetTy, VT)) return false; - // Unfortunately we can't use FastEmit_r, because the AVX version of FSQRT + // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT // is not generated by FastISel yet. // FIXME: Update this code once tablegen can handle it. static const unsigned SqrtOpc[2][2] = { @@ -2356,7 +2306,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { MIB.addReg(SrcReg); - UpdateValueMap(II, ResultReg); + updateValueMap(II, ResultReg); return true; } case Intrinsic::sadd_with_overflow: @@ -2387,15 +2337,23 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { isCommutativeIntrinsic(II)) std::swap(LHS, RHS); + bool UseIncDec = false; + if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne()) + UseIncDec = true; + unsigned BaseOpc, CondOpc; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: - BaseOpc = ISD::ADD; CondOpc = X86::SETOr; break; + BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD); + CondOpc = X86::SETOr; + break; case Intrinsic::uadd_with_overflow: BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; case Intrinsic::ssub_with_overflow: - BaseOpc = ISD::SUB; CondOpc = X86::SETOr; break; + BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); + CondOpc = X86::SETOr; + break; case Intrinsic::usub_with_overflow: BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; case Intrinsic::smul_with_overflow: @@ -2411,9 +2369,21 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { unsigned ResultReg = 0; // Check if we have an immediate version. 
- if (auto const *C = dyn_cast<ConstantInt>(RHS)) { - ResultReg = FastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, - C->getZExtValue()); + if (const auto *CI = dyn_cast<ConstantInt>(RHS)) { + static const unsigned Opc[2][4] = { + { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, + { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } + }; + + if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { + ResultReg = createResultReg(TLI.getRegClassFor(VT)); + bool IsDec = BaseOpc == X86ISD::DEC; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) + .addReg(LHSReg, getKillRegState(LHSIsKill)); + } else + ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, + CI->getZExtValue()); } unsigned RHSReg; @@ -2423,7 +2393,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { if (RHSReg == 0) return false; RHSIsKill = hasTrivialKill(RHS); - ResultReg = FastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, + ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); } @@ -2438,7 +2408,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) .addReg(LHSReg, getKillRegState(LHSIsKill)); - ResultReg = FastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], + ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { static const unsigned MULOpc[] = @@ -2449,10 +2419,10 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), X86::AL) .addReg(LHSReg, getKillRegState(LHSIsKill)); - ResultReg = FastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, + ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); } else - ResultReg = 
FastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], + ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), LHSReg, LHSIsKill, RHSReg, RHSIsKill); } @@ -2465,7 +2435,7 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); - UpdateValueMap(II, ResultReg, 2); + updateValueMap(II, ResultReg, 2); return true; } case Intrinsic::x86_sse_cvttss2si: @@ -2531,13 +2501,13 @@ bool X86FastISel::FastLowerIntrinsicCall(const IntrinsicInst *II) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Reg); - UpdateValueMap(II, ResultReg); + updateValueMap(II, ResultReg); return true; } } } -bool X86FastISel::FastLowerArguments() { +bool X86FastISel::fastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; @@ -2554,7 +2524,7 @@ bool X86FastISel::FastLowerArguments() { if (!Subtarget->is64Bit()) return false; - + // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. unsigned GPRCnt = 0; unsigned FPRCnt = 0; @@ -2627,7 +2597,7 @@ bool X86FastISel::FastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); - UpdateValueMap(&Arg, ResultReg); + updateValueMap(&Arg, ResultReg); } return true; } @@ -2649,7 +2619,7 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, return 4; } -bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { +bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { auto &OutVals = CLI.OutVals; auto &OutFlags = CLI.OutFlags; auto &OutRegs = CLI.OutRegs; @@ -2699,6 +2669,9 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { TM.Options.GuaranteedTailCallOpt)) return false; + SmallVector<MVT, 16> OutVTs; + SmallVector<unsigned, 16> ArgRegs; + // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra // instruction. 
This is safe because it is common to all FastISel supported // calling conventions on x86. @@ -2716,46 +2689,44 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { // Passing bools around ends up doing a trunc to i1 and passing it. // Codegen this as an argument + "and 1". - if (auto *TI = dyn_cast<TruncInst>(Val)) { - if (TI->getType()->isIntegerTy(1) && CLI.CS && - (TI->getParent() == CLI.CS->getInstruction()->getParent()) && - TI->hasOneUse()) { - Val = cast<TruncInst>(Val)->getOperand(0); - unsigned ResultReg = getRegForValue(Val); - - if (!ResultReg) - return false; - - MVT ArgVT; - if (!isTypeLegal(Val->getType(), ArgVT)) - return false; + MVT VT; + auto *TI = dyn_cast<TruncInst>(Val); + unsigned ResultReg; + if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && + (TI->getParent() == CLI.CS->getInstruction()->getParent()) && + TI->hasOneUse()) { + Value *PrevVal = TI->getOperand(0); + ResultReg = getRegForValue(PrevVal); + + if (!ResultReg) + return false; - ResultReg = - FastEmit_ri(ArgVT, ArgVT, ISD::AND, ResultReg, Val->hasOneUse(), 1); + if (!isTypeLegal(PrevVal->getType(), VT)) + return false; - if (!ResultReg) - return false; - UpdateValueMap(Val, ResultReg); - } + ResultReg = + fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); + } else { + if (!isTypeLegal(Val->getType(), VT)) + return false; + ResultReg = getRegForValue(Val); } + + if (!ResultReg) + return false; + + ArgRegs.push_back(ResultReg); + OutVTs.push_back(VT); } // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, - CLI.RetTy->getContext()); + CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); - SmallVector<MVT, 16> OutVTs; - for (auto *Val : OutVals) { - MVT VT; - if (!isTypeLegal(Val->getType(), VT)) - return false; - OutVTs.push_back(VT); - } CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. @@ -2764,11 +2735,11 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Walk the register/memloc assignments, inserting copies/loads. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(TM.getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign const &VA = ArgLocs[i]; const Value *ArgVal = OutVals[VA.getValNo()]; @@ -2777,9 +2748,7 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { if (ArgVT == MVT::x86mmx) return false; - unsigned ArgReg = getRegForValue(ArgVal); - if (!ArgReg) - return false; + unsigned ArgReg = ArgRegs[VA.getValNo()]; // Promote the value if needed. 
switch (VA.getLocInfo()) { @@ -2819,7 +2788,7 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { break; } case CCValAssign::BCvt: { - ArgReg = FastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, + ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, /*TODO: Kill=*/false); assert(ArgReg && "Failed to emit a bitcast!"); ArgVT = VA.getLocVT(); @@ -2846,6 +2815,11 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { OutRegs.push_back(VA.getLocReg()); } else { assert(VA.isMemLoc()); + + // Don't emit stores for undef values. + if (isa<UndefValue>(ArgVal)) + continue; + unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; AM.Base.Reg = RegInfo->getStackRegister(); @@ -2982,7 +2956,7 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { // Now handle call return values. SmallVector<CCValAssign, 16> RVLocs; - CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, + CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, CLI.RetTy->getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); @@ -2999,39 +2973,33 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { report_fatal_error("SSE register return with SSE disabled"); } - // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the value is popped from the stack, so - // a COPY is not good enough - the copy instruction may be eliminated if the - // return value is not used. We use the FpPOP_RETVAL instruction instead. - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { - // If we prefer to use the value in xmm registers, copy it out as f80 and - // use a truncate to move it from fp stack reg to xmm reg. 
- if (isScalarFPTypeInSSEReg(VA.getValVT())) { - CopyVT = MVT::f80; - CopyReg = createResultReg(&X86::RFP80RegClass); - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::FpPOP_RETVAL), CopyReg); - - // Round the f80 to the right size, which also moves it to the appropriate - // xmm register. This is accomplished by storing the f80 value in memory - // and then loading it back. - if (CopyVT != VA.getValVT()) { - EVT ResVT = VA.getValVT(); - unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; - unsigned MemSize = ResVT.getSizeInBits()/8; - int FI = MFI.CreateStackObject(MemSize, MemSize, false); - addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc)), FI) - .addReg(CopyReg); - Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; - addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg + i), FI); - } - } else { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); - InRegs.push_back(VA.getLocReg()); + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. + if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) { + CopyVT = MVT::f80; + CopyReg = createResultReg(&X86::RFP80RegClass); + } + + // Copy out the result. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); + InRegs.push_back(VA.getLocReg()); + + // Round the f80 to the right size, which also moves it to the appropriate + // xmm register. This is accomplished by storing the f80 value in memory + // and then loading it back. + if (CopyVT != VA.getValVT()) { + EVT ResVT = VA.getValVT(); + unsigned Opc = ResVT == MVT::f32 ? 
X86::ST_Fp80m32 : X86::ST_Fp80m64; + unsigned MemSize = ResVT.getSizeInBits()/8; + int FI = MFI.CreateStackObject(MemSize, MemSize, false); + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc)), FI) + .addReg(CopyReg); + Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm; + addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg + i), FI); } } @@ -3043,7 +3011,7 @@ bool X86FastISel::FastLowerCall(CallLoweringInfo &CLI) { } bool -X86FastISel::TargetSelectInstruction(const Instruction *I) { +X86FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; case Instruction::Load: @@ -3086,7 +3054,7 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return X86SelectTrunc(I); unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; - UpdateValueMap(I, Reg); + updateValueMap(I, Reg); return true; } } @@ -3094,13 +3062,69 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) { return false; } -unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { - MVT VT; - if (!isTypeLegal(C->getType(), VT)) +unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { + if (VT > MVT::i64) return 0; + uint64_t Imm = CI->getZExtValue(); + if (Imm == 0) { + unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: + case MVT::i8: + return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, + X86::sub_8bit); + case MVT::i16: + return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, + X86::sub_16bit); + case MVT::i32: + return SrcReg; + case MVT::i64: { + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + } + } + 
+ unsigned Opc = 0; + switch (VT.SimpleTy) { + default: llvm_unreachable("Unexpected value type"); + case MVT::i1: VT = MVT::i8; // fall-through + case MVT::i8: Opc = X86::MOV8ri; break; + case MVT::i16: Opc = X86::MOV16ri; break; + case MVT::i32: Opc = X86::MOV32ri; break; + case MVT::i64: { + if (isUInt<32>(Imm)) + Opc = X86::MOV32ri; + else if (isInt<32>(Imm)) + Opc = X86::MOV64ri32; + else + Opc = X86::MOV64ri; + break; + } + } + if (VT == MVT::i64 && Opc == X86::MOV32ri) { + unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } + return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); +} + +unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (CFP->isNullValue()) + return fastMaterializeFloatZero(CFP); + // Can't handle alternate code models yet. - if (TM.getCodeModel() != CodeModel::Small) + CodeModel::Model CM = TM.getCodeModel(); + if (CM != CodeModel::Small && CM != CodeModel::Large) return 0; // Get opcode and regclass of the output for the given load instruction. @@ -3108,23 +3132,6 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; - case MVT::i8: - Opc = X86::MOV8rm; - RC = &X86::GR8RegClass; - break; - case MVT::i16: - Opc = X86::MOV16rm; - RC = &X86::GR16RegClass; - break; - case MVT::i32: - Opc = X86::MOV32rm; - RC = &X86::GR32RegClass; - break; - case MVT::i64: - // Must be in x86-64 mode. - Opc = X86::MOV64rm; - RC = &X86::GR64RegClass; - break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? 
X86::VMOVSSrm : X86::MOVSSrm; @@ -3148,39 +3155,11 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { return 0; } - // Materialize addresses with LEA/MOV instructions. - if (isa<GlobalValue>(C)) { - X86AddressMode AM; - if (X86SelectAddress(C, AM)) { - // If the expression is just a basereg, then we're done, otherwise we need - // to emit an LEA. - if (AM.BaseType == X86AddressMode::RegBase && - AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) - return AM.Base.Reg; - - unsigned ResultReg = createResultReg(RC); - if (TM.getRelocationModel() == Reloc::Static && - TLI.getPointerTy() == MVT::i64) { - // The displacement code be more than 32 bits away so we need to use - // an instruction with a 64 bit immediate - Opc = X86::MOV64ri; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg).addGlobalAddress(cast<GlobalValue>(C)); - } else { - Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; - addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg), AM); - } - return ResultReg; - } - return 0; - } - // MachineConstantPool wants an explicit alignment. - unsigned Align = DL.getPrefTypeAlignment(C->getType()); + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); if (Align == 0) { - // Alignment of vector types. FIXME! - Align = DL.getTypeAllocSize(C->getType()); + // Alignment of vector types. FIXME! + Align = DL.getTypeAllocSize(CFP->getType()); } // x86-32 PIC requires a PIC base register for constant pools. @@ -3198,23 +3177,88 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) { } // Create the load from the constant pool. 
- unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align); + unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); unsigned ResultReg = createResultReg(RC); + + if (CM == CodeModel::Large) { + unsigned AddrReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + AddrReg) + .addConstantPoolIndex(CPI, 0, OpFlag); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + addDirectMem(MIB, AddrReg); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, + TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align); + MIB->addMemOperand(*FuncInfo.MF, MMO); + return ResultReg; + } + addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), - MCPOffset, PICBase, OpFlag); - + CPI, PICBase, OpFlag); return ResultReg; } -unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { +unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + // Materialize addresses with LEA/MOV instructions. + X86AddressMode AM; + if (X86SelectAddress(GV, AM)) { + // If the expression is just a basereg, then we're done, otherwise we need + // to emit an LEA. + if (AM.BaseType == X86AddressMode::RegBase && + AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) + return AM.Base.Reg; + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + if (TM.getRelocationModel() == Reloc::Static && + TLI.getPointerTy() == MVT::i64) { + // The displacement code could be more than 32 bits away so we need to use + // an instruction with a 64 bit immediate + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), + ResultReg) + .addGlobalAddress(GV); + } else { + unsigned Opc = TLI.getPointerTy() == MVT::i32 ? 
X86::LEA32r : X86::LEA64r; + addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg), AM); + } + return ResultReg; + } + return 0; +} + +unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + if (const auto *CI = dyn_cast<ConstantInt>(C)) + return X86MaterializeInt(CI, VT); + else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) + return X86MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) + return X86MaterializeGV(GV, VT); + + return 0; +} + +unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { // Fail on dynamic allocas. At this point, getRegForValue has already // checked its CSE maps, so if we're here trying to handle a dynamic // alloca, we're not going to succeed. X86SelectAddress has a // check for dynamic allocas, because it's called directly from - // various places, but TargetMaterializeAlloca also needs a check + // various places, but targetMaterializeAlloca also needs a check // in order to avoid recursion between getRegForValue, - // X86SelectAddrss, and TargetMaterializeAlloca. + // X86SelectAddrss, and targetMaterializeAlloca. if (!FuncInfo.StaticAllocaMap.count(C)) return 0; assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); @@ -3222,7 +3266,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; - unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; + unsigned Opc = TLI.getPointerTy() == MVT::i32 ? 
X86::LEA32r : X86::LEA64r; const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -3230,7 +3274,7 @@ unsigned X86FastISel::TargetMaterializeAlloca(const AllocaInst *C) { return ResultReg; } -unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) { +unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { MVT VT; if (!isTypeLegal(CF->getType(), VT)) return 0; @@ -3276,7 +3320,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, if (!X86SelectAddress(Ptr, AM)) return false; - const X86InstrInfo &XII = (const X86InstrInfo&)TII; + const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); @@ -3288,7 +3332,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, AM.getFullAddress(AddrOps); MachineInstr *Result = - XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment); + XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, + Size, Alignment, /*AllowCommute=*/true); if (!Result) return false; diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index eb9f743226cc..8e033a096fd8 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -155,7 +155,8 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { if (!ST.LEAusesAG() && !ST.slowLEA()) return false; - TII = static_cast<const X86InstrInfo *>(TM->getInstrInfo()); + TII = + static_cast<const X86InstrInfo *>(TM->getSubtargetImpl()->getInstrInfo()); DEBUG(dbgs() << "Start X86FixupLEAs\n";); // Process all basic blocks. 
@@ -217,7 +218,8 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, if (usesRegister(p, CurInst) == RU_Write) { return CurInst; } - InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst); + InstrDistance += TII->getInstrLatency( + TM->getSubtargetImpl()->getInstrItineraryData(), CurInst); Found = getPreviousInstr(CurInst, MFI); } return nullptr; @@ -281,6 +283,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; int addrr_opcode, addri_opcode; switch (opcode) { + default: llvm_unreachable("Unexpected LEA instruction"); case X86::LEA16r: addrr_opcode = X86::ADD16rr; addri_opcode = X86::ADD16ri; @@ -294,8 +297,6 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, addrr_opcode = X86::ADD64rr; addri_opcode = X86::ADD64ri32; break; - default: - assert(false && "Unexpected LEA instruction"); } DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump();); DEBUG(dbgs() << "FixLEA: Replaced by: ";); diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index c8a3ab3082ad..618910946bf9 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -28,12 +28,14 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" @@ -41,7 +43,9 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include 
"llvm/Target/TargetSubtargetInfo.h" #include <algorithm> +#include <bitset> using namespace llvm; #define DEBUG_TYPE "x86-codegen" @@ -50,6 +54,8 @@ STATISTIC(NumFXCH, "Number of fxch instructions inserted"); STATISTIC(NumFP , "Number of floating point instructions"); namespace { + const unsigned ScratchFPReg = 7; + struct FPS : public MachineFunctionPass { static char ID; FPS() : MachineFunctionPass(ID) { @@ -137,7 +143,7 @@ namespace { unsigned StackTop; // The current top of the FP stack. enum { - NumFPRegs = 16 // Including scratch pseudo-registers. + NumFPRegs = 8 // Including scratch pseudo-registers. }; // For each live FP<n> register, point to its Stack[] entry. @@ -146,27 +152,6 @@ namespace { // register allocator thinks. unsigned RegMap[NumFPRegs]; - // Pending fixed registers - Inline assembly needs FP registers to appear - // in fixed stack slot positions. This is handled by copying FP registers - // to ST registers before the instruction, and copying back after the - // instruction. - // - // This is modeled with pending ST registers. NumPendingSTs is the number - // of ST registers (ST0-STn) we are tracking. PendingST[n] points to an FP - // register that holds the ST value. The ST registers are not moved into - // place until immediately before the instruction that needs them. - // - // It can happen that we need an ST register to be live when no FP register - // holds the value: - // - // %ST0 = COPY %FP4<kill> - // - // When that happens, we allocate a scratch FP register to hold the ST - // value. That means every register in PendingST must be live. - - unsigned NumPendingSTs; - unsigned char PendingST[8]; - // Set up our stack model to match the incoming registers to MBB. 
void setupBlockStack(); @@ -180,9 +165,6 @@ namespace { dbgs() << " FP" << Stack[i]; assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!"); } - for (unsigned i = 0; i != NumPendingSTs; ++i) - dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]); - dbgs() << "\n"; } #endif @@ -199,19 +181,6 @@ namespace { return Slot < StackTop && Stack[Slot] == RegNo; } - /// getScratchReg - Return an FP register that is not currently in use. - unsigned getScratchReg() const { - for (int i = NumFPRegs - 1; i >= 8; --i) - if (!isLive(i)) - return i; - llvm_unreachable("Ran out of scratch FP registers"); - } - - /// isScratchReg - Returns trus if RegNo is a scratch FP register. - static bool isScratchReg(unsigned RegNo) { - return RegNo > 8 && RegNo < NumFPRegs; - } - /// getStackEntry - Return the X86::FP<n> register in register ST(i). unsigned getStackEntry(unsigned STi) const { if (STi >= StackTop) @@ -263,21 +232,6 @@ namespace { BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg); } - /// duplicatePendingSTBeforeKill - The instruction at I is about to kill - /// RegNo. If any PendingST registers still need the RegNo value, duplicate - /// them to new scratch registers. - void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) { - for (unsigned i = 0; i != NumPendingSTs; ++i) { - if (PendingST[i] != RegNo) - continue; - unsigned SR = getScratchReg(); - DEBUG(dbgs() << "Duplicating pending ST" << i - << " in FP" << RegNo << " to FP" << SR << '\n'); - duplicateToTop(RegNo, SR, I); - PendingST[i] = SR; - } - } - /// popStackAfter - Pop the current value off of the top of the FP stack /// after the specified instruction. 
void popStackAfter(MachineBasicBlock::iterator &I); @@ -304,6 +258,7 @@ namespace { bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB); + void handleCall(MachineBasicBlock::iterator &I); void handleZeroArgFP(MachineBasicBlock::iterator &I); void handleOneArgFP(MachineBasicBlock::iterator &I); void handleOneArgFPRW(MachineBasicBlock::iterator &I); @@ -320,6 +275,8 @@ namespace { return X86::RFP80RegClass.contains(DstReg) || X86::RFP80RegClass.contains(SrcReg); } + + void setKillFlags(MachineBasicBlock &MBB) const; }; char FPS::ID = 0; } @@ -354,7 +311,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { if (!FPIsUsed) return false; Bundles = &getAnalysis<EdgeBundles>(); - TII = MF.getTarget().getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); // Prepare cross-MBB liveness. bundleCFG(MF); @@ -367,15 +324,13 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *Entry = MF.begin(); bool Changed = false; - for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> > - I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed); - I != E; ++I) - Changed |= processBasicBlock(MF, **I); + for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) + Changed |= processBasicBlock(MF, *BB); // Process any unreachable blocks in arbitrary order now. 
if (MF.size() != Processed.size()) for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB)) + if (Processed.insert(BB).second) Changed |= processBasicBlock(MF, *BB); LiveBundles.clear(); @@ -409,8 +364,8 @@ void FPS::bundleCFG(MachineFunction &MF) { bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; MBB = &BB; - NumPendingSTs = 0; + setKillFlags(BB); setupBlockStack(); for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { @@ -428,6 +383,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { X86::RFP80RegClass.contains(MI->getOperand(0).getReg())) FPInstClass = X86II::SpecialFP; + if (MI->isCall()) + FPInstClass = X86II::SpecialFP; + if (FPInstClass == X86II::NotFP) continue; // Efficiently ignore non-fp insts! @@ -462,7 +420,9 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // after definition. If so, pop them. for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { unsigned Reg = DeadRegs[i]; - if (Reg >= X86::FP0 && Reg <= X86::FP6) { + // Check if Reg is live on the stack. An inline-asm register operand that + // is in the clobber list and marked dead might not be live on the stack. 
+ if (Reg >= X86::FP0 && Reg <= X86::FP6 && isLive(Reg-X86::FP0)) { DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n"); freeStackSlotAfter(I, Reg-X86::FP0); } @@ -874,7 +834,9 @@ FPS::freeStackSlotBefore(MachineBasicBlock::iterator I, unsigned FPRegNo) { RegMap[TopReg] = OldSlot; RegMap[FPRegNo] = ~0; Stack[--StackTop] = ~0; - return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)).addReg(STReg); + return BuildMI(*MBB, I, DebugLoc(), TII->get(X86::ST_FPrr)) + .addReg(STReg) + .getInstr(); } /// adjustLiveRegs - Kill and revive registers such that exactly the FP @@ -966,6 +928,31 @@ void FPS::shuffleStackTop(const unsigned char *FixStack, // Instruction transformation implementation //===----------------------------------------------------------------------===// +void FPS::handleCall(MachineBasicBlock::iterator &I) { + unsigned STReturns = 0; + + for (const auto &MO : I->operands()) { + if (!MO.isReg()) + continue; + + unsigned R = MO.getReg() - X86::FP0; + + if (R < 8) { + assert(MO.isDef() && MO.isImplicit()); + STReturns |= 1 << R; + } + } + + unsigned N = CountTrailingOnes_32(STReturns); + + // FP registers used for function return must be consecutive starting at + // FP0. + assert(STReturns == 0 || (isMask_32(STReturns) && N <= 2)); + + for (unsigned I = 0; I < N; ++I) + pushReg(N - I - 1); +} + /// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem> /// void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) { @@ -992,9 +979,6 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { unsigned Reg = getFPReg(MI->getOperand(NumOps-1)); bool KillsSrc = MI->killsRegister(X86::FP0+Reg); - if (KillsSrc) - duplicatePendingSTBeforeKill(Reg, I); - // FISTP64m is strange because there isn't a non-popping versions. // If we have one _and_ we don't want to pop the operand, duplicate the value // on the stack instead of moving it. 
This ensure that popping the value is @@ -1015,7 +999,7 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) { MI->getOpcode() == X86::ISTT_Fp32m80 || MI->getOpcode() == X86::ISTT_Fp64m80 || MI->getOpcode() == X86::ST_FpP80m)) { - duplicateToTop(Reg, getScratchReg(), I); + duplicateToTop(Reg, ScratchFPReg, I); } else { moveToTop(Reg, I); // Move to the top of the stack... } @@ -1058,7 +1042,6 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) { bool KillsSrc = MI->killsRegister(X86::FP0+Reg); if (KillsSrc) { - duplicatePendingSTBeforeKill(Reg, I); // If this is the last use of the source register, just make sure it's on // the top of the stack. moveToTop(Reg, I); @@ -1314,71 +1297,22 @@ void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) { /// floating point instructions. This is primarily intended for use by pseudo /// instructions. /// -void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { - MachineInstr *MI = I; +void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { + MachineInstr *MI = Inst; + + if (MI->isCall()) { + handleCall(Inst); + return; + } + switch (MI->getOpcode()) { default: llvm_unreachable("Unknown SpecialFP instruction!"); case TargetOpcode::COPY: { // We handle three kinds of copies: FP <- FP, FP <- ST, and ST <- FP. const MachineOperand &MO1 = MI->getOperand(1); const MachineOperand &MO0 = MI->getOperand(0); - unsigned DstST = MO0.getReg() - X86::ST0; - unsigned SrcST = MO1.getReg() - X86::ST0; bool KillsSrc = MI->killsRegister(MO1.getReg()); - // ST = COPY FP. Set up a pending ST register. - if (DstST < 8) { - unsigned SrcFP = getFPReg(MO1); - assert(isLive(SrcFP) && "Cannot copy dead register"); - assert(!MO0.isDead() && "Cannot copy to dead ST register"); - - // Unallocated STs are marked as the nonexistent FP255. - while (NumPendingSTs <= DstST) - PendingST[NumPendingSTs++] = NumFPRegs; - - // STi could still be live from a previous inline asm. 
- if (isScratchReg(PendingST[DstST])) { - DEBUG(dbgs() << "Clobbering old ST in FP" << unsigned(PendingST[DstST]) - << '\n'); - freeStackSlotBefore(MI, PendingST[DstST]); - } - - // When the source is killed, allocate a scratch FP register. - if (KillsSrc) { - duplicatePendingSTBeforeKill(SrcFP, I); - unsigned Slot = getSlot(SrcFP); - unsigned SR = getScratchReg(); - PendingST[DstST] = SR; - Stack[Slot] = SR; - RegMap[SR] = Slot; - } else - PendingST[DstST] = SrcFP; - break; - } - - // FP = COPY ST. Extract fixed stack value. - // Any instruction defining ST registers must have assigned them to a - // scratch register. - if (SrcST < 8) { - unsigned DstFP = getFPReg(MO0); - assert(!isLive(DstFP) && "Cannot copy ST to live FP register"); - assert(NumPendingSTs > SrcST && "Cannot copy from dead ST register"); - unsigned SrcFP = PendingST[SrcST]; - assert(isScratchReg(SrcFP) && "Expected ST in a scratch register"); - assert(isLive(SrcFP) && "Scratch holding ST is dead"); - - // DstFP steals the stack slot from SrcFP. - unsigned Slot = getSlot(SrcFP); - Stack[Slot] = DstFP; - RegMap[DstFP] = Slot; - - // Always treat the ST as killed. - PendingST[SrcST] = NumFPRegs; - while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) - --NumPendingSTs; - break; - } - // FP <- FP copy. unsigned DstFP = getFPReg(MO0); unsigned SrcFP = getFPReg(MO1); @@ -1392,7 +1326,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { } else { // For COPY we just duplicate the specified value to a new stack slot. // This could be made better, but would require substantial changes. - duplicateToTop(SrcFP, DstFP, I); + duplicateToTop(SrcFP, DstFP, Inst); } break; } @@ -1401,41 +1335,11 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // All FP registers must be explicitly defined, so load a 0 instead. 
unsigned Reg = MI->getOperand(0).getReg() - X86::FP0; DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n'); - BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); + BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::LD_F0)); pushReg(Reg); break; } - case X86::FpPOP_RETVAL: { - // The FpPOP_RETVAL instruction is used after calls that return a value on - // the floating point stack. We cannot model this with ST defs since CALL - // instructions have fixed clobber lists. This instruction is interpreted - // to mean that there is one more live register on the stack than we - // thought. - // - // This means that StackTop does not match the hardware stack between a - // call and the FpPOP_RETVAL instructions. We do tolerate FP instructions - // between CALL and FpPOP_RETVAL as long as they don't overflow the - // hardware stack. - unsigned DstFP = getFPReg(MI->getOperand(0)); - - // Move existing stack elements up to reflect reality. - assert(StackTop < 8 && "Stack overflowed before FpPOP_RETVAL"); - if (StackTop) { - std::copy_backward(Stack, Stack + StackTop, Stack + StackTop + 1); - for (unsigned i = 0; i != NumFPRegs; ++i) - ++RegMap[i]; - } - ++StackTop; - - // DstFP is the new bottom of the stack. - Stack[0] = DstFP; - RegMap[DstFP] = 0; - - // DstFP will be killed by processBasicBlock if this was a dead def. - break; - } - case TargetOpcode::INLINEASM: { // The inline asm MachineInstr currently only *uses* FP registers for the // 'f' constraint. These should be turned into the current ST(x) register @@ -1472,19 +1376,30 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // only tell clobbers from defs by looking at the asm descriptor. 
unsigned STUses = 0, STDefs = 0, STClobbers = 0, STDeadDefs = 0; unsigned NumOps = 0; + SmallSet<unsigned, 1> FRegIdx; + unsigned RCID; + for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI->getNumOperands(); i != e && MI->getOperand(i).isImm(); i += 1 + NumOps) { unsigned Flags = MI->getOperand(i).getImm(); + NumOps = InlineAsm::getNumOperandRegisters(Flags); if (NumOps != 1) continue; const MachineOperand &MO = MI->getOperand(i + 1); if (!MO.isReg()) continue; - unsigned STReg = MO.getReg() - X86::ST0; + unsigned STReg = MO.getReg() - X86::FP0; if (STReg >= 8) continue; + // If the flag has a register class constraint, this must be an operand + // with constraint "f". Record its index and continue. + if (InlineAsm::hasRegClassConstraint(Flags, RCID)) { + FRegIdx.insert(i + 1); + continue; + } + switch (InlineAsm::getKind(Flags)) { case InlineAsm::Kind_RegUse: STUses |= (1u << STReg); @@ -1527,71 +1442,42 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { DEBUG(dbgs() << "Asm uses " << NumSTUses << " fixed regs, pops " << NumSTPopped << ", and defines " << NumSTDefs << " regs.\n"); - // Scan the instruction for FP uses corresponding to "f" constraints. - // Collect FP registers to kill afer the instruction. - // Always kill all the scratch regs. +#ifndef NDEBUG + // If any input operand uses constraint "f", all output register + // constraints must be early-clobber defs. + for (unsigned I = 0, E = MI->getNumOperands(); I < E; ++I) + if (FRegIdx.count(I)) { + assert((1 << getFPReg(MI->getOperand(I)) & STDefs) == 0 && + "Operands with constraint \"f\" cannot overlap with defs"); + } +#endif + + // Collect all FP registers (register operands with constraints "t", "u", + // and "f") to kill afer the instruction. 
unsigned FPKills = ((1u << NumFPRegs) - 1) & ~0xff; - unsigned FPUsed = 0; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &Op = MI->getOperand(i); if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; - if (!Op.isUse()) - MI->emitError("illegal \"f\" output constraint"); unsigned FPReg = getFPReg(Op); - FPUsed |= 1U << FPReg; // If we kill this operand, make sure to pop it from the stack after the // asm. We just remember it for now, and pop them all off at the end in // a batch. - if (Op.isKill()) + if (Op.isUse() && Op.isKill()) FPKills |= 1U << FPReg; } - // The popped inputs will be killed by the instruction, so duplicate them - // if the FP register needs to be live after the instruction, or if it is - // used in the instruction itself. We effectively treat the popped inputs - // as early clobbers. - for (unsigned i = 0; i < NumSTPopped; ++i) { - if ((FPKills & ~FPUsed) & (1u << PendingST[i])) - continue; - unsigned SR = getScratchReg(); - duplicateToTop(PendingST[i], SR, I); - DEBUG(dbgs() << "Duplicating ST" << i << " in FP" - << unsigned(PendingST[i]) << " to avoid clobbering it.\n"); - PendingST[i] = SR; - } - - // Make sure we have a unique live register for every fixed use. Some of - // them could be undef uses, and we need to emit LD_F0 instructions. - for (unsigned i = 0; i < NumSTUses; ++i) { - if (i < NumPendingSTs && PendingST[i] < NumFPRegs) { - // Check for shared assignments. - for (unsigned j = 0; j < i; ++j) { - if (PendingST[j] != PendingST[i]) - continue; - // STi and STj are inn the same register, create a copy. 
- unsigned SR = getScratchReg(); - duplicateToTop(PendingST[i], SR, I); - DEBUG(dbgs() << "Duplicating ST" << i << " in FP" - << unsigned(PendingST[i]) - << " to avoid collision with ST" << j << '\n'); - PendingST[i] = SR; - } - continue; - } - unsigned SR = getScratchReg(); - DEBUG(dbgs() << "Emitting LD_F0 for ST" << i << " in FP" << SR << '\n'); - BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0)); - pushReg(SR); - PendingST[i] = SR; - if (NumPendingSTs == i) - ++NumPendingSTs; - } - assert(NumPendingSTs >= NumSTUses && "Fixed registers should be assigned"); + // Do not include registers that are implicitly popped by defs/clobbers. + FPKills &= ~(STDefs | STClobbers); // Now we can rearrange the live registers to match what was requested. - shuffleStackTop(PendingST, NumPendingSTs, I); + unsigned char STUsesArray[8]; + + for (unsigned I = 0; I < NumSTUses; ++I) + STUsesArray[I] = I; + + shuffleStackTop(STUsesArray, NumSTUses, Inst); DEBUG({dbgs() << "Before asm: "; dumpStack();}); // With the stack layout fixed, rewrite the FP registers. @@ -1599,36 +1485,22 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { MachineOperand &Op = MI->getOperand(i); if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6) continue; + unsigned FPReg = getFPReg(Op); - Op.setReg(getSTReg(FPReg)); + + if (FRegIdx.count(i)) + // Operand with constraint "f". + Op.setReg(getSTReg(FPReg)); + else + // Operand with a single register class constraint ("t" or "u"). + Op.setReg(X86::ST0 + FPReg); } // Simulate the inline asm popping its inputs and pushing its outputs. StackTop -= NumSTPopped; - // Hold the fixed output registers in scratch FP registers. They will be - // transferred to real FP registers by copies. 
- NumPendingSTs = 0; - for (unsigned i = 0; i < NumSTDefs; ++i) { - unsigned SR = getScratchReg(); - pushReg(SR); - FPKills &= ~(1u << SR); - } for (unsigned i = 0; i < NumSTDefs; ++i) - PendingST[NumPendingSTs++] = getStackEntry(i); - DEBUG({dbgs() << "After asm: "; dumpStack();}); - - // If any of the ST defs were dead, pop them immediately. Our caller only - // handles dead FP defs. - MachineBasicBlock::iterator InsertPt = MI; - for (unsigned i = 0; STDefs & (1u << i); ++i) { - if (!(STDeadDefs & (1u << i))) - continue; - freeStackSlotAfter(InsertPt, PendingST[i]); - PendingST[i] = NumFPRegs; - } - while (NumPendingSTs && PendingST[NumPendingSTs - 1] == NumFPRegs) - --NumPendingSTs; + pushReg(NumSTDefs - i - 1); // If this asm kills any FP registers (is the last use of them) we must // explicitly emit pop instructions for them. Do this now after the asm has @@ -1640,9 +1512,10 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { while (FPKills) { unsigned FPReg = countTrailingZeros(FPKills); if (isLive(FPReg)) - freeStackSlotAfter(InsertPt, FPReg); + freeStackSlotAfter(Inst, FPReg); FPKills &= ~(1U << FPReg); } + // Don't delete the inline asm! return; } @@ -1655,12 +1528,12 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); unsigned FPReg = getFPReg(Op); if (Op.isKill()) - moveToTop(FPReg, I); + moveToTop(FPReg, Inst); else - duplicateToTop(FPReg, FPReg, I); + duplicateToTop(FPReg, FPReg, Inst); // Emit the call. This will pop the operand. - BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) + BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) .addExternalSymbol("_ftol2") .addReg(X86::ST0, RegState::ImplicitKill) .addReg(X86::ECX, RegState::ImplicitDefine) @@ -1738,7 +1611,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { // Duplicate the TOS so that we return it twice. Just pick some other FPx // register to hold it. 
- unsigned NewReg = getScratchReg(); + unsigned NewReg = ScratchFPReg; duplicateToTop(FirstFPRegOp, NewReg, MI); FirstFPRegOp = NewReg; } @@ -1761,13 +1634,54 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) { return; } - I = MBB->erase(I); // Remove the pseudo instruction + Inst = MBB->erase(Inst); // Remove the pseudo instruction // We want to leave I pointing to the previous instruction, but what if we // just erased the first instruction? - if (I == MBB->begin()) { + if (Inst == MBB->begin()) { DEBUG(dbgs() << "Inserting dummy KILL\n"); - I = BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)); + Inst = BuildMI(*MBB, Inst, DebugLoc(), TII->get(TargetOpcode::KILL)); } else - --I; + --Inst; +} + +void FPS::setKillFlags(MachineBasicBlock &MBB) const { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + LivePhysRegs LPR(TRI); + + LPR.addLiveOuts(&MBB); + + for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + I != E; ++I) { + if (I->isDebugValue()) + continue; + + std::bitset<8> Defs; + SmallVector<MachineOperand *, 2> Uses; + MachineInstr &MI = *I; + + for (auto &MO : I->operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg() - X86::FP0; + + if (Reg >= 8) + continue; + + if (MO.isDef()) { + Defs.set(Reg); + if (!LPR.contains(MO.getReg())) + MO.setIsDead(); + } else + Uses.push_back(&MO); + } + + for (auto *MO : Uses) + if (Defs.test(getFPReg(*MO)) || !LPR.contains(MO->getReg())) + MO->setIsKill(); + + LPR.stepBackward(MI); + } } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 8c029a8c22d5..f2eb6a8ea73e 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" +#include <cstdlib> using namespace llvm; @@ -37,7 
+38,34 @@ using namespace llvm; extern cl::opt<bool> ForceStackAlign; bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); + return !MF.getFrameInfo()->hasVarSizedObjects() && + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Having a FP, as in the default +/// implementation, is not sufficient here since we can't always use it. +/// Use a more nuanced condition. +bool +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *> + (MF.getSubtarget().getRegisterInfo()); + return hasReservedCallFrame(MF) || + (hasFP(MF) && !TRI->needsStackRealignment(MF)) + || TRI->hasBasePointer(MF); +} + +// needsFrameIndexResolution - Do we need to perform FI resolution for +// this function. Normally, this is required only when the function +// has any stack objects. However, FI resolution actually has another job, +// not apparent from the title - it resolves callframesetup/destroy +// that were not simplified earlier. +// So, this is required for x86 functions that have push sequences even +// when there are no stack objects. 
+bool +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects() || + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -46,14 +74,15 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn()); + MMI.callsUnwindInit() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -80,6 +109,17 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { } } +static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::AND64ri8; + return X86::AND64ri32; + } + if (isInt<8>(Imm)) + return X86::AND32ri8; + return X86::AND32ri; +} + static unsigned getLEArOpcode(unsigned IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } @@ -148,32 +188,32 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, int64_t NumBytes, - bool Is64Bit, bool IsLP64, bool UseLEA, + bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? 
-NumBytes : NumBytes; unsigned Opc; if (UseLEA) - Opc = getLEArOpcode(IsLP64); + Opc = getLEArOpcode(Is64BitStackPtr); else Opc = isSub - ? getSUBriOpcode(IsLP64, Offset) - : getADDriOpcode(IsLP64, Offset); + ? getSUBriOpcode(Is64BitStackPtr, Offset) + : getADDriOpcode(Is64BitStackPtr, Offset); uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; - if (ThisVal == (Is64Bit ? 8 : 4)) { + if (ThisVal == (Is64BitTarget ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub - ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) - : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit); + ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) + : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); if (Reg) { Opc = isSub - ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r) - : (Is64Bit ? X86::POP64r : X86::POP32r); + ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) + : (Is64BitTarget ? X86::POP64r : X86::POP32r); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) @@ -314,7 +354,7 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); @@ -352,6 +392,23 @@ static bool usesTheStack(const MachineFunction &MF) { return false; } +void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, + unsigned &CallOp, + const char *&Symbol) { + CallOp = STI.is64Bit() ? 
X86::W64ALLOCA : X86::CALLpcrel32; + + if (STI.is64Bit()) { + if (STI.isTargetCygMing()) { + Symbol = "___chkstk_ms"; + } else { + Symbol = "__chkstk"; + } + } else if (STI.isTargetCygMing()) + Symbol = "_alloca"; + else + Symbol = "_chkstk"; +} + /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to @@ -418,6 +475,8 @@ static bool usesTheStack(const MachineFunction &MF) { [if needs base pointer] mov %rsp, %rbx + [if needs to restore base pointer] + mov %rsp, -MMM(%rbp) ; Emit CFI info [if needs FP] @@ -440,8 +499,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. @@ -449,11 +508,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { bool HasFP = hasFP(MF); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); - bool IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); bool IsWin64 = STI.isTargetWin64(); - bool IsWinEH = - MF.getTarget().getMCAsmInfo()->getExceptionHandlingType() == - ExceptionHandling::WinEH; // Not necessarily synonymous with IsWin64. + // Not necessarily synonymous with IsWin64. 
+ bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); @@ -461,6 +520,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? + getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; @@ -482,6 +543,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); + bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); + + // The default stack probe size is 4096 if the function has no stackprobesize + // attribute. + unsigned StackProbeSize = 4096; + if (Fn->hasFnAttribute("stack-probe-size")) + Fn->getFnAttribute("stack-probe-size") + .getValueAsString() + .getAsInteger(0, StackProbeSize); + // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the @@ -507,7 +578,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (TailCallReturnAddrDelta < 0) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, - TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)), + TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) .addImm(-TailCallReturnAddrDelta) @@ -535,6 +606,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { if (HasFP) { // Calculate required stack adjustment. 
uint64_t FrameSize = StackSize - SlotSize; + // If required, include space for extra hidden slot for stashing base pointer. + if (X86FI->getRestoreBasePointer()) + FrameSize += SlotSize; if (RegInfo->needsStackRealignment(MF)) { // Callee-saved registers are pushed on stack before the stack // is realigned. @@ -551,7 +625,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) - .addReg(FramePtr, RegState::Kill) + .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { @@ -564,7 +638,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. - unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, 2 * stackGrowth)); @@ -580,14 +654,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr) + TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) @@ -596,7 +670,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // Mark the FramePtr as live-in in every block. 
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - I->addLiveIn(FramePtr); + I->addLiveIn(MachineFramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } @@ -633,11 +707,12 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // able to calculate their offsets from the frame pointer). if (RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); + uint64_t Val = -MaxAlign; MachineInstr *MI = BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr) + TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) .addReg(StackPtr) - .addImm(-MaxAlign) + .addImm(Val) .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. @@ -663,19 +738,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. - if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetMacho()) { + if (NumBytes >= StackProbeSize && UseStackProbe) { const char *StackProbeSymbol; + unsigned CallOp; - if (Is64Bit) { - if (STI.isTargetCygMing()) { - StackProbeSymbol = "___chkstk_ms"; - } else { - StackProbeSymbol = "__chkstk"; - } - } else if (STI.isTargetCygMing()) - StackProbeSymbol = "_alloca"; - else - StackProbeSymbol = "_chkstk"; + getStackProbeFunction(STI, CallOp, StackProbeSymbol); // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); @@ -706,7 +773,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { } BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? 
X86::W64ALLOCA : X86::CALLpcrel32)) + TII.get(CallOp)) .addExternalSymbol(StackProbeSymbol) .addReg(StackPtr, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) @@ -722,15 +789,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { .setMIFlag(MachineInstr::FrameSetup); } if (isEAXAlive) { - // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); - MI->setFlag(MachineInstr::FrameSetup); - MBB.insert(MBBI, MI); + // Restore EAX + MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), + X86::EAX), + StackPtr, false, NumBytes - 4); + MI->setFlag(MachineInstr::FrameSetup); + MBB.insert(MBBI, MI); } } else if (NumBytes) { - emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64, + emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } @@ -746,7 +813,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // will restore SP to (BP - SEHFrameOffset) for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { int offset = MFI->getObjectOffset(Info.getFrameIdx()); - SEHFrameOffset = std::max(SEHFrameOffset, abs(offset)); + SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); } SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant @@ -804,10 +871,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // to reference locals. if (RegInfo->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. - unsigned Opc = Is64Bit ? X86::MOV64rr : X86::MOV32rr; + unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); + if (X86FI->getRestoreBasePointer()) { + // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. + unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { @@ -834,21 +909,28 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); - bool IsLP64 = STI.isTarget64BitLP64(); + // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + const bool Is64BitILP32 = STI.isTarget64BitILP32(); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); + unsigned MachineFramePtr = Is64BitILP32 ? + getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); + bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); + switch (RetOpcode) { default: llvm_unreachable("Can only insert epilog into returning blocks"); @@ -898,7 +980,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Pop EBP. 
BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); } else { NumBytes = StackSize - CSSize; } @@ -930,27 +1012,39 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; if (CSSize != 0) { - unsigned Opc = getLEArOpcode(IsLP64); + unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false, -CSSize); + --MBBI; } else { - unsigned Opc = (Is64Bit ? X86::MOV64rr : X86::MOV32rr); + unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); + --MBBI; } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. - emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA, + emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); + --MBBI; } + // Windows unwinder will not invoke function's exception handler if IP is + // either in prologue or in epilogue. This behavior causes a problem when a + // call immediately precedes an epilogue, because the return address points + // into the epilogue. To cope with that, we insert an epilogue marker here, + // then replace it with a 'nop' if it ends up immediately after a CALL in the + // final emitted code. + if (NeedsWinEH) + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); + // We're returning from function via eh_return. if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { MBBI = MBB.getLastNonDebugInstr(); MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), + TII.get(Uses64BitFramePtr ? 
X86::MOV64rr : X86::MOV32rr), StackPtr).addReg(DestAddr.getReg()); } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNmi || @@ -976,7 +1070,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64, + emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } @@ -1021,7 +1115,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); - emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII, + emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } } @@ -1029,7 +1123,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); uint64_t StackSize = MFI->getStackSize(); @@ -1072,7 +1166,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. 
@@ -1085,12 +1179,85 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return getFrameIndexOffset(MF, FI); } +// Simplified from getFrameIndexOffset keeping only StackPointer cases +int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Does not include any dynamic realign. + const uint64_t StackSize = MFI->getStackSize(); + { +#ifndef NDEBUG + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + // Note: LLVM arranges the stack as: + // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) + // > "Stack Slots" (<--SP) + // We can always address StackSlots from RSP. We can usually (unless + // needsStackRealignment) address CSRs from RSP, but sometimes need to + // address them from RBP. FixedObjects can be placed anywhere in the stack + // frame depending on their specific requirements (i.e. we can actually + // refer to arguments to the function which are stored in the *callers* + // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs + // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. + + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + // We don't handle tail calls, and shouldn't be seeing them + // either. + int TailCallReturnAddrDelta = + MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); + assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); +#endif + } + + // This is how the math works out: + // + // %rsp grows (i.e. gets lower) left to right. Each box below is + // one word (eight bytes). Obj0 is the stack slot we're trying to + // get to. + // + // ---------------------------------- + // | BP | Obj0 | Obj1 | ... | ObjN | + // ---------------------------------- + // ^ ^ ^ ^ + // A B C E + // + // A is the incoming stack pointer. 
+ // (B - A) is the local area offset (-8 for x86-64) [1] + // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] + // + // |(E - B)| is the StackSize (absolute value, positive). For a + // stack that grown down, this works out to be (B - E). [3] + // + // E is also the value of %rsp after stack has been set up, and we + // want (C - E) -- the value we can add to %rsp to get to Obj0. Now + // (C - E) == (C - A) - (B - A) + (B - E) + // { Using [1], [2] and [3] above } + // == getObjectOffset - LocalAreaOffset + StackSize + // + + // Get the Offset from the StackPointer + int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); + + return Offset + StackSize; +} +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const X86RegisterInfo *RegInfo = + static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo()); + + assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); + + FrameReg = RegInfo->getStackRegister(); + return getFrameIndexOffsetFromSP(MF, FI); +} + bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector<CalleeSavedInfo> &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1107,7 +1274,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // about avoiding it later. 
unsigned FPReg = RegInfo->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { - if (CSI[i].getReg() == FPReg) { + if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); break; } @@ -1138,7 +1305,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); // ensure alignment - SpillSlotOffset -= abs(SpillSlotOffset) % RC->getAlignment(); + SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); // spill into slot SpillSlotOffset -= RC->getSize(); int SlotIndex = @@ -1157,7 +1324,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Push GPRs. It increases frame size. @@ -1205,7 +1372,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); // Reload XMMs from stack frame. @@ -1237,7 +1404,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -1278,7 +1445,7 @@ HasNestArgument(const MachineFunction *MF) { /// and the properties of the function either one or two registers will be /// needed. 
Set primary to true for the first register, false for the second. static unsigned -GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { +GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); // Erlang stuff. @@ -1289,8 +1456,12 @@ GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) { return Primary ? X86::EBX : X86::EDI; } - if (Is64Bit) - return Primary ? X86::R11 : X86::R12; + if (Is64Bit) { + if (IsLP64) + return Primary ? X86::R11 : X86::R12; + else + return Primary ? X86::R11D : X86::R12D; + } bool IsNested = HasNestArgument(&MF); @@ -1314,21 +1485,23 @@ void X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { MachineBasicBlock &prologueMBB = MF.front(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); uint64_t StackSize; const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); unsigned TlsReg, TlsOffset; DebugLoc DL; - unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true); + unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); if (MF.getFunction()->isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); - if (!STI.isTargetLinux() && !STI.isTargetDarwin() && - !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD()) + if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && + !STI.isTargetWin64() && !STI.isTargetFreeBSD() && + !STI.isTargetDragonFly()) report_fatal_error("Segmented stacks not supported on this platform."); // Eventually StackSize will be calculated by a link-time pass; which will @@ -1359,7 +1532,7 @@ 
X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } if (IsNested) - allocMBB->addLiveIn(X86::R10); + allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D); MF.push_front(allocMBB); MF.push_front(checkMBB); @@ -1372,7 +1545,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { if (Is64Bit) { if (STI.isTargetLinux()) { TlsReg = X86::FS; - TlsOffset = 0x70; + TlsOffset = IsLP64 ? 0x70 : 0x40; } else if (STI.isTargetDarwin()) { TlsReg = X86::GS; TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90. @@ -1382,17 +1555,20 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } else if (STI.isTargetFreeBSD()) { TlsReg = X86::FS; TlsOffset = 0x18; + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x20; // use tls_tcb.tcb_segstack } else { report_fatal_error("Segmented stacks not supported on this platform."); } if (CompareStackPointer) - ScratchReg = X86::RSP; + ScratchReg = IsLP64 ? X86::RSP : X86::ESP; else - BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP) + BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg) + BuildMI(checkMBB, DL, TII.get(IsLP64 ? 
X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else { if (STI.isTargetLinux()) { @@ -1404,6 +1580,9 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } else if (STI.isTargetWin32()) { TlsReg = X86::FS; TlsOffset = 0x14; // pvArbitrary, reserved for application use + } else if (STI.isTargetDragonFly()) { + TlsReg = X86::FS; + TlsOffset = 0x10; // use tls_tcb.tcb_segstack } else if (STI.isTargetFreeBSD()) { report_fatal_error("Segmented stacks not supported on FreeBSD i386."); } else { @@ -1416,7 +1595,8 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP) .addImm(1).addReg(0).addImm(-StackSize).addReg(0); - if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64()) { + if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() || + STI.isTargetDragonFly()) { BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg) .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg); } else if (STI.isTargetDarwin()) { @@ -1426,11 +1606,11 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { bool SaveScratch2; if (CompareStackPointer) { // The primary scratch register is available for holding the TLS offset. - ScratchReg2 = GetScratchRegister(Is64Bit, MF, true); + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true); SaveScratch2 = false; } else { // Need to use a second register to hold the TLS offset - ScratchReg2 = GetScratchRegister(Is64Bit, MF, false); + ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false); // Unfortunately, with fastcc the second scratch register may hold an // argument. @@ -1460,7 +1640,7 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // This jump is taken if SP >= (Stacklet Limit + Stack Space required). // It jumps to normal execution of the function body. 
- BuildMI(checkMBB, DL, TII.get(X86::JA_4)).addMBB(&prologueMBB); + BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB); // On 32 bit we first push the arguments size and then the frame size. On 64 // bit, we pass the stack frame size in r10 and the argument size in r11. @@ -1468,15 +1648,21 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { // Functions with nested arguments use R10, so it needs to be saved across // the call to _morestack + const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX; + const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D; + const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D; + const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr; + const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri; + if (IsNested) - BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10); + BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10); - BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10) + BuildMI(allocMBB, DL, TII.get(MOVri), Reg10) .addImm(StackSize); - BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11) + BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(X86::R10); - MF.getRegInfo().setPhysRegUsed(X86::R11); + MF.getRegInfo().setPhysRegUsed(Reg10); + MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1485,12 +1671,36 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { } // __morestack is in libgcc - if (Is64Bit) - BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) - .addExternalSymbol("__morestack"); - else - BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("__morestack"); + if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) { + // Under the large code model, we cannot assume that __morestack lives + // within 2^31 bytes of the call site, so we cannot use pc-relative + 
// addressing. We cannot perform the call via a temporary register, + // as the rax register may be used to store the static chain, and all + // other suitable registers may be either callee-save or used for + // parameter passing. We cannot use the stack at this point either + // because __morestack manipulates the stack directly. + // + // To avoid these issues, perform an indirect call via a read-only memory + // location containing the address. + // + // This solution is not perfect, as it assumes that the .rodata section + // is laid out within 2^31 bytes of each function body, but this seems + // to be sufficient for JIT. + BuildMI(allocMBB, DL, TII.get(X86::CALL64m)) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addExternalSymbol("__morestack_addr") + .addReg(0); + MF.getMMI().setUsesMorestackAddr(true); + } else { + if (Is64Bit) + BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack"); + else + BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__morestack"); + } if (IsNested) BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10)); @@ -1523,13 +1733,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const { /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); const unsigned SlotSize = - static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()) + static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo()) ->getSlotSize(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); const bool Is64Bit = STI.is64Bit(); + const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; // HiPE-specific values const unsigned HipeLeafWords = 24; @@ -1623,7 +1834,7 @@ void 
X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { SPLimitOffset = 0x4c; } - ScratchReg = GetScratchRegister(Is64Bit, MF, true); + ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "HiPE prologue scratch register is live-in"); @@ -1633,7 +1844,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB); + BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). @@ -1642,7 +1853,7 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); - BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB); + BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); stackCheckMBB->addSuccessor(&prologueMBB, 99); stackCheckMBB->addSuccessor(incStackMBB, 1); @@ -1657,48 +1868,49 @@ void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - const X86RegisterInfo &RegInfo = - *static_cast<const X86RegisterInfo *>(MF.getTarget().getRegisterInfo()); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( + MF.getSubtarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); - bool reseveCallFrame = hasReservedCallFrame(MF); + bool reserveCallFrame = hasReservedCallFrame(MF); int Opcode = 
I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; + uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; I = MBB.erase(I); - if (!reseveCallFrame) { + if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' - // TODO: consider using push / pop instead of sub + store / add if (Amount == 0) return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = - MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; MachineInstr *New = nullptr; - if (Opcode == TII.getCallFrameSetupOpcode()) { - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), - StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - // Factor out the amount the callee already popped. 
- Amount -= CalleeAmt; + // Factor out the amount that gets handled inside the sequence + // (Pushes of argument for frame setup, callee pops for frame destroy) + Amount -= InternalAmt; + + if (Amount) { + if (Opcode == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) + .addReg(StackPtr).addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); - if (Amount) { unsigned Opc = getADDriOpcode(IsLP64, Amount); New = BuildMI(MF, DL, TII.get(Opc), StackPtr) .addReg(StackPtr).addImm(Amount); @@ -1716,13 +1928,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return; } - if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. - unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); + unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(CalleeAmt); + .addReg(StackPtr).addImm(InternalAmt); // The EFLAGS implicit def is dead. 
New->getOperand(3).setIsDead(); diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 5ad3d4dc2e97..9cb887ac112d 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86_FRAMELOWERING_H -#define X86_FRAMELOWERING_H +#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H +#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #include "llvm/Target/TargetFrameLowering.h" @@ -20,12 +20,17 @@ namespace llvm { class MCSymbol; class X86TargetMachine; +class X86Subtarget; class X86FrameLowering : public TargetFrameLowering { public: explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} + static void getStackProbeFunction(const X86Subtarget &STI, + unsigned &CallOp, + const char *&Symbol); + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const; @@ -59,14 +64,30 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; + bool needsFrameIndexResolution(const MachineFunction &MF) const override; int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; + int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; + int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + +private: + /// convertArgMovsToPushes - This method tries to convert a call sequence + /// that uses sub and mov 
instructions to put the argument onto the stack + /// into a series of pushes. + /// Returns true if the transformation succeeded, false if not. + bool convertArgMovsToPushes(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + uint64_t Amount) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index ba2f5f645d09..7fe8e8b344aa 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" @@ -33,6 +34,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include <stdint.h> using namespace llvm; #define DEBUG_TYPE "x86-isel" @@ -192,7 +194,6 @@ namespace { private: SDNode *Select(SDNode *N) override; SDNode *SelectGather(SDNode *N, unsigned Opc); - SDNode *SelectAtomic64(SDNode *Node, unsigned Opc); SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); @@ -237,10 +238,10 @@ namespace { inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, - getTargetLowering()->getPointerTy()) : - AM.Base_Reg; + Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) + ? 
CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, + TLI->getPointerTy()) + : AM.Base_Reg; Scale = getI8Imm(AM.Scale); Index = AM.IndexReg; // These are 32-bit even in 64-bit mode since RIP relative offset @@ -297,7 +298,14 @@ namespace { /// getInstrInfo - Return a reference to the TargetInstrInfo, casted /// to the target-specific type. const X86InstrInfo *getInstrInfo() const { - return getTargetMachine().getInstrInfo(); + return getTargetMachine().getSubtargetImpl()->getInstrInfo(); + } + + /// \brief Address-mode matching performs shift-of-and to and-of-shift + /// reassociation in order to expose more scaled addressing + /// opportunities. + bool ComplexPatternFuncMutatesDAG() const override { + return true; } }; } @@ -510,7 +518,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. const X86TargetLowering *X86Lowering = - static_cast<const X86TargetLowering *>(getTargetLowering()); + static_cast<const X86TargetLowering *>(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) @@ -544,7 +552,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { false, false, 0); SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MachinePointerInfo(), - MemVT, false, false, 0); + MemVT, false, false, false, 0); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because @@ -565,7 +573,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { /// the main function. void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI) { - const TargetInstrInfo *TII = TM.getInstrInfo(); + const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo(); if (Subtarget->isTargetCygMing()) { unsigned CallOp = Subtarget->is64Bit() ? 
X86::CALL64pcrel32 : X86::CALLpcrel32; @@ -775,9 +783,10 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { } } -// Transform "(X >> (8-C1)) & C2" to "(X >> 8) & 0xff)" if safe. This -// allows us to convert the shift and and into an h-register extract and -// a scaled index. Returns false if the simplification is performed. +// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if +// safe. This allows us to convert the shift and and into an h-register +// extract and a scaled index. Returns false if the simplification is +// performed. static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, @@ -1429,7 +1438,7 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base); if (RN && RN->getReg() == 0) Base = CurDAG->getRegister(0, MVT::i64); - else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) { + else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(Base)) { // Base could already be %rip, particularly in the x32 ABI. 
Base = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, @@ -1563,26 +1572,7 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, /// SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); - return CurDAG->getRegister(GlobalBaseReg, - getTargetLowering()->getPointerTy()).getNode(); -} - -SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) { - SDValue Chain = Node->getOperand(0); - SDValue In1 = Node->getOperand(1); - SDValue In2L = Node->getOperand(2); - SDValue In2H = Node->getOperand(3); - - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) - return nullptr; - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); - const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain}; - SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), - MVT::i32, MVT::i32, MVT::Other, Ops); - cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1); - return ResNode; + return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy()).getNode(); } /// Atomic opcode table @@ -1716,16 +1706,23 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = { static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, SDLoc dl, enum AtomicOpc &Op, MVT NVT, - SDValue Val) { + SDValue Val, + const X86Subtarget *Subtarget) { if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) { int64_t CNVal = CN->getSExtValue(); // Quit if not 32-bit imm. if ((int32_t)CNVal != CNVal) return Val; + // Quit if INT32_MIN: it would be negated as it is negative and overflow, + // producing an immediate that does not fit in the 32 bits available for + // an immediate operand to sub. However, it still fits in 32 bits for the + // add (since it is not negated) so we can return target-constant. 
+ if (CNVal == INT32_MIN) + return CurDAG->getTargetConstant(CNVal, NVT); // For atomic-load-add, we could do some optimizations. if (Op == ADD) { // Translate to INC/DEC if ADD by 1 or -1. - if ((CNVal == 1) || (CNVal == -1)) { + if (((CNVal == 1) || (CNVal == -1)) && !Subtarget->slowIncDec()) { Op = (CNVal == 1) ? INC : DEC; // No more constant operand after being translated into INC/DEC. return SDValue(); @@ -1774,8 +1771,8 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Chain = Node->getOperand(0); SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); - SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) + SDValue Base, Scale, Index, Disp, Segment; + if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) return nullptr; // Which index into the table. @@ -1797,7 +1794,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { break; } - Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val); + Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val, Subtarget); bool isUnOp = !Val.getNode(); bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant); @@ -1829,31 +1826,40 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { Opc = AtomicOpcTbl[Op][I32]; break; case MVT::i64: - Opc = AtomicOpcTbl[Op][I64]; if (isCN) { if (immSext8(Val.getNode())) Opc = AtomicOpcTbl[Op][SextConstantI64]; else if (i64immSExt32(Val.getNode())) Opc = AtomicOpcTbl[Op][ConstantI64]; - } + else + llvm_unreachable("True 64 bits constant in SelectAtomicLoadArith"); + } else + Opc = AtomicOpcTbl[Op][I64]; break; } assert(Opc != 0 && "Invalid arith lock transform!"); + // Building the new node. 
SDValue Ret; - SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, - dl, NVT), 0); - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); if (isUnOp) { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain }; + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Chain }; Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); } else { - SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain }; + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, Val, Chain }; Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops), 0); } + + // Copying the MachineMemOperand. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(Node)->getMemOperand(); cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1); + + // We need to have two outputs as that is what the original instruction had. + // So we add a dummy, undefined output. This is safe as we checked first + // that no-one uses our output anyway. 
+ SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, + dl, NVT), 0); SDValue RetVals[] = { Undef, Ret }; return CurDAG->getMergeValues(RetVals, dl).getNode(); } @@ -1885,8 +1891,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) { case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr: case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm: case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm: - case X86::JA_4: case X86::JAE_4: case X86::JB_4: case X86::JBE_4: - case X86::JE_4: case X86::JNE_4: case X86::JP_4: case X86::JNP_4: + case X86::JA_1: case X86::JAE_1: case X86::JB_1: case X86::JBE_1: + case X86::JE_1: case X86::JNE_1: case X86::JP_1: case X86::JNP_1: case X86::CMOVA16rr: case X86::CMOVA16rm: case X86::CMOVA32rr: case X86::CMOVA32rm: case X86::CMOVA64rr: case X86::CMOVA64rm: @@ -2125,6 +2131,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case X86ISD::GlobalBaseReg: return getGlobalBaseReg(); + case X86ISD::SHRUNKBLEND: { + // SHRUNKBLEND selects like a regular VSELECT. + SDValue VSelect = CurDAG->getNode( + ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), + Node->getOperand(1), Node->getOperand(2)); + ReplaceUses(SDValue(Node, 0), VSelect); + SelectCode(VSelect.getNode()); + // We already called ReplaceUses. + return nullptr; + } case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: @@ -2212,6 +2228,25 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { return CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), getI8Imm(ShlVal)); } + case X86ISD::UMUL8: + case X86ISD::SMUL8: { + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + Opc = (Opcode == X86ISD::SMUL8 ? 
X86::IMUL8r : X86::MUL8r); + + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::AL, + N0, SDValue()).getValue(1); + + SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32); + SDValue Ops[] = {N1, InFlag}; + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + + ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1)); + return nullptr; + } + case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); @@ -2387,11 +2422,14 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::SDIVREM: - case ISD::UDIVREM: { + case ISD::UDIVREM: + case X86ISD::SDIVREM8_SEXT_HREG: + case X86ISD::UDIVREM8_ZEXT_HREG: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - bool isSigned = Opcode == ISD::SDIVREM; + bool isSigned = (Opcode == ISD::SDIVREM || + Opcode == X86ISD::SDIVREM8_SEXT_HREG); if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); @@ -2466,7 +2504,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. - SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = @@ -2507,33 +2545,43 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); } - // Prevent use of AH in a REX instruction by referencing AX instead. - // Shift it down 8 bits. + // Prevent use of AH in a REX instruction by explicitly copying it to + // an ABCD_L register. // // The current assumption of the register allocator is that isel - // won't generate explicit references to the GPR8_NOREX registers. If + // won't generate explicit references to the GR8_ABCD_H registers. 
If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. - if (HiReg == X86::AH && Subtarget->is64Bit() && - !SDValue(Node, 1).use_empty()) { - SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, - X86::AX, MVT::i16, InFlag); - InFlag = Result.getValue(2); - - // If we also need AL (the quotient), get it by extracting a subreg from - // Result. The fast register allocator does not like multiple CopyFromReg - // nodes using aliasing registers. - if (!SDValue(Node, 0).use_empty()) - ReplaceUses(SDValue(Node, 0), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); - - // Shift AX right by 8 bits instead of using AH. - Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16, - Result, - CurDAG->getTargetConstant(8, MVT::i8)), - 0); - ReplaceUses(SDValue(Node, 1), - CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); + if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { + SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); + unsigned AHExtOpcode = + isSigned ? X86::MOVSX32_NOREXrr8 : X86::MOVZX32_NOREXrr8; + + SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, + MVT::Glue, AHCopy, InFlag); + SDValue Result(RNode, 0); + InFlag = SDValue(RNode, 1); + + if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG || + Opcode == X86ISD::SDIVREM8_SEXT_HREG) { + if (Node->getValueType(1) == MVT::i64) { + // It's not possible to directly movsx AH to a 64bit register, because + // the latter needs the REX prefix, but the former can't have it. 
+ assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG && + "Unexpected i64 sext of h-register"); + Result = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), Result, + CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)), + 0); + } + } else { + Result = + CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); + } + ReplaceUses(SDValue(Node, 1), Result); + DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { @@ -2563,12 +2611,30 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to - // use a smaller encoding. if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) - // Look past the truncate if CMP is the only use of it. + HasNoSignedComparisonUses(Node)) { + // Look for (X86cmp (truncate $op, i1), 0) and try to convert to a + // smaller encoding + if (Opcode == X86ISD::CMP && N0.getValueType() == MVT::i1 && + X86::isZeroNode(N1)) { + SDValue Reg = N0.getOperand(0); + SDValue Imm = CurDAG->getTargetConstant(1, MVT::i8); + + // Emit testb + if (Reg.getScalarValueSizeInBits() > 8) + Reg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg); + // Emit a testb. + SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, + Reg, Imm); + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); + return nullptr; + } + N0 = N0.getOperand(0); + } + // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to + // use a smaller encoding. + // Look past the truncate if CMP is the only use of it. 
if ((N0.getNode()->getOpcode() == ISD::AND || (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && N0.getNode()->hasOneUse() && diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 6fc4c847429c..177299b8afc4 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19,6 +19,7 @@ #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" @@ -49,6 +50,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" +#include "X86IntrinsicsInfo.h" #include <bitset> #include <numeric> #include <cctype> @@ -65,10 +67,23 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( cl::Hidden); static cl::opt<bool> ExperimentalVectorShuffleLowering( - "x86-experimental-vector-shuffle-lowering", cl::init(false), + "x86-experimental-vector-shuffle-lowering", cl::init(true), cl::desc("Enable an experimental vector shuffle lowering code path."), cl::Hidden); +static cl::opt<bool> ExperimentalVectorShuffleLegality( + "x86-experimental-vector-shuffle-legality", cl::init(false), + cl::desc("Enable experimental shuffle legality based on the experimental " + "shuffle lowering. Should only be used with the experimental " + "shuffle lowering."), + cl::Hidden); + +static cl::opt<int> ReciprocalEstimateRefinementSteps( + "x86-recip-refinement-steps", cl::init(1), + cl::desc("Specify the number of Newton-Raphson iterations applied to the " + "result of the hardware reciprocal estimate instruction."), + cl::NotHidden); + // Forward declarations. 
static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); @@ -99,21 +114,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin()+NormalizedIdxVal, + makeArrayRef(Vec->op_begin() + NormalizedIdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, - VecIdx); - - return Result; - + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } + /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the -/// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes +/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { @@ -150,25 +162,23 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, * ElemsPerChunk); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, - VecIdx); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } + /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits -/// we want. It need not be aligned to a 128-bit bounday. That makes +/// we want. It need not be aligned to a 128-bit boundary. 
That makes /// lowering INSERT_VECTOR_ELT operations easier. -static SDValue Insert128BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG,SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } -static SDValue Insert256BitVector(SDValue Result, SDValue Vec, - unsigned IdxVal, SelectionDAG &DAG, - SDLoc dl) { +static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, + SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } @@ -191,28 +201,10 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -static TargetLoweringObjectFile *createTLOF(const Triple &TT) { - if (TT.isOSBinFormatMachO()) { - if (TT.getArch() == Triple::x86_64) - return new X86_64MachoTargetObjectFile(); - return new TargetLoweringObjectFileMachO(); - } - - if (TT.isOSLinux()) - return new X86LinuxTargetObjectFile(); - if (TT.isOSBinFormatELF()) - return new TargetLoweringObjectFileELF(); - if (TT.isKnownWindowsMSVCEnvironment()) - return new X86WindowsTargetObjectFile(); - if (TT.isOSBinFormatCOFF()) - return new TargetLoweringObjectFileCOFF(); - llvm_unreachable("unknown subtarget type"); -} - // FIXME: This should stop caching the target machine as soon as // we can remove resetOperationActions et al. 
-X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) - : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { +X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM) + : TargetLowering(TM) { Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); @@ -240,13 +232,13 @@ void X86TargetLowering::resetOperationActions() { // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; - // X86 is weird, it always uses i8 for shift amounts and setcc results. + // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - // For 64-bit since we have so many registers use the ILP scheduler, for - // 32-bit code use the register pressure specific scheduling. + // For 64-bit, since we have so many registers, use the ILP scheduler. + // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); @@ -255,13 +247,14 @@ void X86TargetLowering::resetOperationActions() { else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); + TM.getSubtarget<X86Subtarget>().getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); - // Bypass expensive divides on Atom when compiling with O2 - if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { - addBypassSlowDiv(32, 8); - if (Subtarget->is64Bit()) + // Bypass expensive divides on Atom when compiling with O2. 
+ if (TM.getOptLevel() >= CodeGenOpt::Default) { + if (Subtarget->hasSlowDivide32()) + addBypassSlowDiv(32, 8); + if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } @@ -306,7 +299,8 @@ void X86TargetLowering::resetOperationActions() { if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); + for (MVT VT : MVT::integer_valuetypes()) + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); @@ -531,7 +525,9 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); @@ -661,8 +657,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? - MVT::i64 : MVT::i32, Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. 
@@ -810,13 +805,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); + setOperationAction(ISD::FMINNUM, MVT::f80, Expand); + setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; + for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); @@ -885,18 +880,19 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); - for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction(VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); + for (MVT InnerVT : MVT::vector_valuetypes()) { + setTruncStoreAction(InnerVT, VT, Expand); + + setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); - // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types, - // we have to deal with them whether we ask for Expansion or not. Setting - // Expand causes its own optimisation problems though, so leave them legal. - if (VT.getVectorElementType() == MVT::i1) - setLoadExtAction(ISD::EXTLOAD, VT, Expand); + // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like + // types, we have to deal with them whether we ask for Expansion or not. 
+ // Setting Expand causes its own optimisation problems though, so leave + // them legal. + if (VT.getVectorElementType() == MVT::i1) + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); + } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones @@ -953,12 +949,13 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); - // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM + // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); @@ -999,6 +996,14 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + // Only provide customized ctpop vector bit twiddling for vector types we + // know to perform better than using the popcnt instructions on each vector + // element. If popcnt isn't supported, always provide the custom version. + if (!Subtarget->hasPOPCNT()) { + setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); + setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + } + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; @@ -1013,6 +1018,22 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } + // We support custom legalizing of sext and anyext loads for specific + // memory vector types which we can load as a scalar (or sequence of + // scalars) and extend in-register to a legal 128-bit vector type. For sext + // loads these must work with a single scalar load. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); + } + setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); @@ -1064,7 +1085,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -1106,7 +1128,15 @@ void X86TargetLowering::resetOperationActions() { // some vselects for now. 
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); - // i8 and i16 vectors are custom , because the source register and source + // SSE41 brings specific instructions for doing vector sign extend even in + // cases where we don't have SRA. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); + } + + // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. @@ -1120,7 +1150,7 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); - // FIXME: these should be Legal but thats only for the case where + // FIXME: these should be Legal, but that's only for the case where // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); @@ -1200,7 +1230,8 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1270,6 +1301,20 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); + + // The custom lowering for UINT_TO_FP for v8i32 becomes interesting + // when we have a 256bit-wide blend with immediate. 
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); + + // Only provide customized ctpop vector bit twiddling for vector types we + // know to perform better than using the popcnt instructions on each + // vector element. If popcnt isn't supported, always provide the custom + // version. + if (!Subtarget->hasPOPCNT()) + setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); + + // Custom CTPOP always performs better on natively supported v8i32 + setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -1298,15 +1343,16 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. - for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { + if (VT.getScalarSizeInBits() >= 32) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. 
- if (VT.is128BitVector()) + if (VT.is128BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; @@ -1351,12 +1397,14 @@ void X86TargetLowering::resetOperationActions() { addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + for (MVT VT : MVT::fp_vector_valuetypes()) + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); @@ -1394,6 +1442,10 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); @@ -1466,16 +1518,13 @@ void X86TargetLowering::resetOperationActions() { } // Custom lower several nodes. 
- for (int i = MVT::FIRST_VECTOR_VALUETYPE; - i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { - MVT VT = (MVT::SimpleValueType)i; - + for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. - if (VT.is128BitVector() || VT.is256BitVector()) + if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - + } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); @@ -1491,12 +1540,14 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to promote non-256-bit vectors + // Do not attempt to promote non-512-bit vectors. 
if (!VT.is512BitVector()) continue; @@ -1505,13 +1556,59 @@ void X86TargetLowering::resetOperationActions() { } }// has AVX-512 + if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) { + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); + + addRegisterClass(MVT::v32i1, &X86::VK32RegClass); + addRegisterClass(MVT::v64i1, &X86::VK64RegClass); + + setOperationAction(ISD::LOAD, MVT::v32i16, Legal); + setOperationAction(ISD::LOAD, MVT::v64i8, Legal); + setOperationAction(ISD::SETCC, MVT::v32i1, Custom); + setOperationAction(ISD::SETCC, MVT::v64i1, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, Legal); + setOperationAction(ISD::ADD, MVT::v64i8, Legal); + setOperationAction(ISD::SUB, MVT::v32i16, Legal); + setOperationAction(ISD::SUB, MVT::v64i8, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, Legal); + + for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { + const MVT VT = (MVT::SimpleValueType)i; + + const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + // Do not attempt to promote non-512-bit vectors. + if (!VT.is512BitVector()) + continue; + + if (EltSize < 32) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + } + } + } + + if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) { + addRegisterClass(MVT::v4i1, &X86::VK4RegClass); + addRegisterClass(MVT::v2i1, &X86::VK2RegClass); + + setOperationAction(ISD::SETCC, MVT::v4i1, Custom); + setOperationAction(ISD::SETCC, MVT::v2i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Legal); + + setOperationAction(ISD::AND, MVT::v8i32, Legal); + setOperationAction(ISD::OR, MVT::v8i32, Legal); + setOperationAction(ISD::XOR, MVT::v8i32, Legal); + setOperationAction(ISD::AND, MVT::v4i32, Legal); + setOperationAction(ISD::OR, MVT::v4i32, Legal); + setOperationAction(ISD::XOR, MVT::v4i32, Legal); + } + // SIGN_EXTEND_INREGs are evaluated by the extend type. 
Handle the expansion // of this type with custom code. - for (int VT = MVT::FIRST_VECTOR_VALUETYPE; - VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Custom); - } + for (MVT VT : MVT::vector_valuetypes()) + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -1537,9 +1634,6 @@ void X86TargetLowering::resetOperationActions() { setOperationAction(ISD::UMULO, VT, Custom); } - // There are no 8-bit 3-address imul/mul instructions - setOperationAction(ISD::SMULO, MVT::i8, Expand); - setOperationAction(ISD::UMULO, MVT::i8, Expand); if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. @@ -1553,9 +1647,8 @@ void X86TargetLowering::resetOperationActions() { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { - // For MacOSX, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. + // For MacOSX, we don't want the normal expansion of a libcall to sincos. + // We want to issue a libcall to __sincos_stret to avoid memory traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } @@ -1586,7 +1679,9 @@ void X86TargetLowering::resetOperationActions() { setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); + setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -1614,8 +1709,15 @@ void X86TargetLowering::resetOperationActions() { // Predictable cmov don't hurt on atom because it's in-order. 
PredictableSelectIsExpensive = !Subtarget->isAtom(); - + EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. + + verifyIntrinsicTables(); +} + +// This has so far only been implemented for 64-bit MachO. +bool X86TargetLowering::useLoadStackGuardNode() const { + return Subtarget->isTargetMachO() && Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction @@ -1632,16 +1734,46 @@ EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; - if (Subtarget->hasAVX512()) - switch(VT.getVectorNumElements()) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; + const unsigned NumElts = VT.getVectorNumElements(); + const EVT EltVT = VT.getVectorElementType(); + if (VT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } + + if (VT.is256BitVector() || VT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } } return VT.changeVectorElementTypeToInteger(); } -/// getMaxByValAlign - Helper for getByValTypeAlignment to determine +/// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. 
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) @@ -1666,7 +1798,7 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { } } -/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate +/// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. @@ -1685,7 +1817,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { return Align; } -/// getOptimalMemOpType - Returns the target specific optimal type for load +/// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it @@ -1742,15 +1874,16 @@ bool X86TargetLowering::isSafeMemOpType(MVT VT) const { } bool -X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, - unsigned, - bool *Fast) const { +X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned, + unsigned, + bool *Fast) const { if (Fast) *Fast = Subtarget->isUnalignedMemAccessFast(); return true; } -/// getJumpTableEncoding - Return the entry encoding for a jump table in the +/// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { @@ -1776,8 +1909,7 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, MCSymbolRefExpr::VK_GOTOFF, Ctx); } -/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC -/// jumptable. +/// Returns relocation base for the given PIC jumptable. 
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) @@ -1787,9 +1919,8 @@ SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, return Table; } -/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the -/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an -/// MCExpr. +/// This returns the relocation base for the given PIC jumptable, +/// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { @@ -1810,9 +1941,7 @@ X86TargetLowering::findRepresentativeClass(MVT VT) const{ default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: - RRC = Subtarget->is64Bit() ? - (const TargetRegisterClass*)&X86::GR64RegClass : - (const TargetRegisterClass*)&X86::GR32RegClass; + RRC = Subtarget->is64Bit() ? 
&X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; @@ -1867,8 +1996,7 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), - RVLocs, Context); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } @@ -1887,8 +2015,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), - RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; @@ -1905,7 +2032,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); - // Promote values to the appropriate types + // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) @@ -1916,7 +2043,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && - "Unexpected FP-extend for return value."); + "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. @@ -1934,8 +2061,8 @@ X86TargetLowering::LowerReturn(SDValue Chain, // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. 
- if (VA.getLocReg() == X86::ST0 || - VA.getLocReg() == X86::ST1) { + if (VA.getLocReg() == X86::FP0 || + VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) @@ -2021,6 +2148,13 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; + // If we are returning more than one value, we can definitely + // not make a tail call see PR19530 + if (UI->getNumOperands() > 4) + return false; + if (UI->getNumOperands() == 4 && + UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) + return false; HasRet = true; } @@ -2031,8 +2165,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return true; } -MVT -X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, +EVT +X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? @@ -2041,11 +2175,11 @@ X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, else ReturnMVT = MVT::i32; - MVT MinVT = getRegisterType(ReturnMVT); + EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } -/// LowerCallResult - Lower the result values of a call into the +/// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue @@ -2058,8 +2192,8 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, // Assign locations to each value returned by this call. 
SmallVector<CCValAssign, 16> RVLocs; bool Is64Bit = Subtarget->is64Bit(); - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. @@ -2073,33 +2207,21 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, report_fatal_error("SSE register return with SSE disabled"); } - SDValue Val; - - // If this is a call to a function that returns an fp value on the floating - // point stack, we must guarantee the value is popped from the stack, so - // a CopyFromReg is not good enough - the copy instruction may be eliminated - // if the return value is not used. We use the FpPOP_RETVAL instruction - // instead. - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { - // If we prefer to use the value in xmm registers, copy it out as f80 and - // use a truncate to move it from fp stack reg to xmm reg. - if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; - SDValue Ops[] = { Chain, InFlag }; - Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, - MVT::Other, MVT::Glue, Ops), 1); - Val = Chain.getValue(0); - - // Round the f80 to the right size, which also moves it to the appropriate - // xmm register. - if (CopyVT != VA.getValVT()) - Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, - // This truncation won't change the value. - DAG.getIntPtrConstant(1)); - } else { - Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), - CopyVT, InFlag).getValue(1); - Val = Chain.getValue(0); - } + // If we prefer to use the value in xmm registers, copy it out as f80 and + // use a truncate to move it from fp stack reg to xmm reg. 
+ if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && + isScalarFPTypeInSSEReg(VA.getValVT())) + CopyVT = MVT::f80; + + Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), + CopyVT, InFlag).getValue(1); + SDValue Val = Chain.getValue(0); + + if (CopyVT != VA.getValVT()) + Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, + // This truncation won't change the value. + DAG.getIntPtrConstant(1)); + InFlag = Chain.getValue(2); InVals.push_back(Val); } @@ -2137,8 +2259,7 @@ callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { return StackStructReturn; } -/// ArgsAreStructReturn - Determines whether a function uses struct -/// return semantics. +/// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { if (Ins.empty()) @@ -2152,10 +2273,9 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { return StackStructReturn; } -/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified -/// by "Src" to address "Dst" with size and alignment information specified by -/// the specific parameter attribute. The copy will be passed as a byval -/// function parameter. +/// Make a copy of an aggregate at address specified by "Src" to address +/// "Dst" with size and alignment information specified by the specific +/// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, @@ -2167,7 +2287,7 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// IsTailCallConvention - Return true if the calling convention is one that +/// Return true if the calling convention is one that /// supports tail call optimization. 
static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || @@ -2192,7 +2312,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { return true; } -/// FuncIsMadeTailCallSafe - Return true if the function is being made into +/// Return true if the function is being made into /// a tailcall target by changing its ABI. static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, bool GuaranteedTailCallOpt) { @@ -2240,6 +2360,55 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, } } +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + + if (Subtarget->isCallingConvWin64(CallConv)) { + static const MCPhysReg GPR64ArgRegsWin64[] = { + X86::RCX, X86::RDX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); + } + + static const MCPhysReg GPR64ArgRegs64Bit[] = { + X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 + }; + return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); +} + +// FIXME: Get this from tablegen. +static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, + CallingConv::ID CallConv, + const X86Subtarget *Subtarget) { + assert(Subtarget->is64Bit()); + if (Subtarget->isCallingConvWin64(CallConv)) { + // The XMM registers which might contain var arg parameters are shadowed + // in their paired GPR. So we only need to save the GPR to their home + // slots. + // TODO: __vectorcall will change this. + return None; + } + + const Function *Fn = MF.getFunction(); + bool NoImplicitFloatOps = Fn->getAttributes(). 
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); + assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) && + "SSE register cannot be used when SSE is disabled!"); + if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || + !Subtarget->hasSSE1()) + // Kernel mode asks for SSE to be disabled, so there are no XMM argument + // registers. + return None; + + static const MCPhysReg XMMArgRegs64Bit[] = { + X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, + X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 + }; + return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); +} + SDValue X86TargetLowering::LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, @@ -2267,8 +2436,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) @@ -2312,6 +2480,10 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; + else if (RegVT == MVT::v32i1) + RC = &X86::VK32RegClass; + else if (RegVT == MVT::v64i1) + RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); @@ -2378,125 +2550,146 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. - if (isVarArg) { - if (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall)) { - FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); + // the start of the first vararg value... for expansion of llvm.va_start. 
We + // can skip this if there are no va_start calls. + if (MFI->hasVAStart() && + (Is64Bit || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall))) { + FuncInfo->setVarArgsFrameIndex( + MFI->CreateFixedObject(1, StackSize, true)); + } + + // Figure out if XMM registers are in use. + assert(!(MF.getTarget().Options.UseSoftFloat && + Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue. + if (Is64Bit && isVarArg && MFI->hasVAStart()) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); + unsigned NumIntRegs = + CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size()); + unsigned NumXMMRegs = + CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size()); + assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + // Gather all the live in physical registers. 
+ SmallVector<SDValue, 6> LiveGPRs; + SmallVector<SDValue, 8> LiveXMMRegs; + SDValue ALVal; + for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back( + DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); + } + if (!ArgXMMs.empty()) { + unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); + for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { + unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); + } } - if (Is64Bit) { - unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; - - // FIXME: We should really autogenerate these arrays - static const MCPhysReg GPR64ArgRegsWin64[] = { - X86::RCX, X86::RDX, X86::R8, X86::R9 - }; - static const MCPhysReg GPR64ArgRegs64Bit[] = { - X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 - }; - static const MCPhysReg XMMArgRegs64Bit[] = { - X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, - X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 - }; - const MCPhysReg *GPR64ArgRegs; - unsigned NumXMMRegs = 0; - - if (IsWin64) { - // The XMM registers which might contain var arg parameters are shadowed - // in their paired GPR. So we only need to save the GPR to their home - // slots. - TotalNumIntRegs = 4; - GPR64ArgRegs = GPR64ArgRegsWin64; - } else { - TotalNumIntRegs = 6; TotalNumXMMRegs = 8; - GPR64ArgRegs = GPR64ArgRegs64Bit; - NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, - TotalNumXMMRegs); - } - unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, - TotalNumIntRegs); - - bool NoImplicitFloatOps = Fn->getAttributes(). 
- hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat); - assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && - "SSE register cannot be used when SSE is disabled!"); - assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && - NoImplicitFloatOps) && - "SSE register cannot be used when SSE is disabled!"); - if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps || - !Subtarget->hasSSE1()) - // Kernel mode asks for SSE to be disabled, so don't push them - // on the stack. - TotalNumXMMRegs = 0; - - if (IsWin64) { - const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering(); - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( + if (IsWin64) { + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = TFI.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by deferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex( - MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, - false)); - } - - // Store the integer parameter registers. 
- SmallVector<SDValue, 8> MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy()); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, - DAG.getIntPtrConstant(Offset)); - unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], - &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); - MemOps.push_back(Store); - Offset += 8; - } - - if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { - // Now store the XMM (fp + vector) parameter registers. - SmallVector<SDValue, 11> SaveXMMOps; - SaveXMMOps.push_back(Chain); - - unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); - SaveXMMOps.push_back(ALVal); - - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex())); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset())); - - for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { - unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], - &X86::VR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); - SaveXMMOps.push_back(Val); - } - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by deferencing the result of va_next. 
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); + } + + // Store the integer parameter registers. + SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + getPointerTy()); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, + DAG.getIntPtrConstant(Offset)); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); + MemOps.push_back(Store); + Offset += 8; + } + + if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { + // Now store the XMM (fp + vector) parameter registers. + SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getRegSaveFrameIndex())); + SaveXMMOps.push_back(DAG.getIntPtrConstant( + FuncInfo->getVarArgsFPOffset())); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); + } + + if (isVarArg && MFI->hasMustTailInVarArgFunc()) { + // Find the largest legal vector type. + MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget->hasAVX512() && + (Is64Bit || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget->hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget->hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. 
+ SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Conservatively forward AL on x86_64, since it might be used for varargs. + if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { + unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &F : Forwards) { + // FIXME: Can we use a less constrained schedule? + SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); + Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } @@ -2544,7 +2737,7 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, false, false, 0); } -/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call +/// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, @@ -2561,7 +2754,7 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, return SDValue(OutRetAddr.getNode(), 1); } -/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call +/// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). 
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, @@ -2599,6 +2792,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); if (MF.getTarget().Options.DisableTailCalls) isTailCall = false; @@ -2630,8 +2824,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), - ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) @@ -2652,7 +2845,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. - X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; @@ -2671,8 +2863,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // arguments passed in memory when using inalloca. 
if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; - assert(ArgLocs.back().getLocMemOffset() == 0 && - "an inalloca argument must be the only memory argument"); + if (!ArgLocs.back().isMemLoc()) + report_fatal_error("cannot use inalloca attribute on a register " + "parameter"); + if (ArgLocs.back().getLocMemOffset() != 0) + report_fatal_error("any parameter with the inalloca attribute must be " + "the only memory argument"); } if (!IsSibcall) @@ -2691,8 +2887,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; @@ -2791,7 +2987,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } - if (Is64Bit && isVarArg && !IsWin64) { + if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in @@ -2813,6 +3009,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(NumXMMRegs, MVT::i8))); } + if (isVarArg && IsMustTail) { + const auto &Forwards = X86Info->getForwardedMustTailRegParms(); + for (const auto &F : Forwards) { + SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); + RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + } + } + // For tail calls lower the arguments to the 'real' stack slots. 
Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. @@ -2889,10 +3093,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. + GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. @@ -2962,6 +3167,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), OpFlags); + } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { + // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI + Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. @@ -2988,7 +3196,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
- const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); const uint32_t *Mask = TRI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3079,9 +3287,9 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(TM.getRegisterInfo()); - const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); + const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; @@ -3194,8 +3402,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. 
- const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; @@ -3223,8 +3431,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) @@ -3244,12 +3452,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } if (Unused) { SmallVector<CCValAssign, 16> RVLocs; - CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; - if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) + if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } @@ -3258,13 +3466,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // results are returned in the same way as what the caller expects. 
if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs1, *DAG.getContext()); + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, + *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - DAG.getTarget(), RVLocs2, *DAG.getContext()); + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, + *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) @@ -3290,8 +3498,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - DAG.getTarget(), ArgLocs, *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, + *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) @@ -3308,7 +3516,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - static_cast<const X86InstrInfo *>(DAG.getTarget().getInstrInfo()); + static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -3336,7 +3544,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = - (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; + (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 
2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -3378,6 +3586,8 @@ static bool MayFoldIntoStore(SDValue Op) { static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; + case X86ISD::BLENDI: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: @@ -3395,7 +3605,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: return true; @@ -3421,7 +3631,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } @@ -3433,6 +3643,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGNR: + case X86ISD::VALIGN: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, @@ -3459,8 +3670,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); int ReturnAddrIndex = FuncInfo->getRAIndex(); @@ -3500,7 +3711,7 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, // For kernel code model we know that all object resist in the negative half // of 32bits address space. 
We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. - if (M == CodeModel::Kernel && Offset > 0) + if (M == CodeModel::Kernel && Offset >= 0) return true; return false; @@ -3510,23 +3721,18 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, /// own arguments. Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt) { - if (IsVarArg) - return false; - switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: - return !is64Bit; case CallingConv::X86_FastCall: - return !is64Bit; case CallingConv::X86_ThisCall: return !is64Bit; case CallingConv::Fast: - return TailCallOpt; case CallingConv::GHC: - return TailCallOpt; case CallingConv::HiPE: + if (IsVarArg) + return false; return TailCallOpt; } } @@ -3667,6 +3873,18 @@ bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } +bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, + ISD::LoadExtType ExtTy, + EVT NewVT) const { + // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF + // relocation target a movq or addq instruction: don't let the load shrink. + SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); + if (BasePtr.getOpcode() == X86ISD::WrapperRIP) + if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) + return GA->getTargetFlags() != X86II::MO_GOTTPOFF; + return true; +} + /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. 
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, @@ -3679,6 +3897,24 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + return (Index == 0 || Index == ResVT.getVectorNumElements()); +} + +bool X86TargetLowering::isCheapToSpeculateCttz() const { + // Speculate cttz only if we can directly use TZCNT. + return Subtarget->hasBMI(); +} + +bool X86TargetLowering::isCheapToSpeculateCtlz() const { + // Speculate ctlz only if we can directly use LZCNT. + return Subtarget->hasLZCNT(); +} + /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -3693,7 +3929,7 @@ static bool isUndefOrEqual(int Val, int CmpVal) { /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified -/// sequential range (L, L+Pos]. or is undef. +/// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) @@ -3703,14 +3939,23 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, } /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference -/// the second operand. -static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) { - if (VT == MVT::v4f32 || VT == MVT::v4i32 ) - return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); - if (VT == MVT::v2f64 || VT == MVT::v2i64) - return (Mask[0] < 2 && Mask[1] < 2); - return false; +/// is suitable for input to PSHUFD. 
That is, it doesn't reference the other +/// operand - by default will match for first operand. +static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT, + bool TestSecondOperand = false) { + if (VT != MVT::v4f32 && VT != MVT::v4i32 && + VT != MVT::v2f64 && VT != MVT::v2i64) + return false; + + unsigned NumElems = VT.getVectorNumElements(); + unsigned Lo = TestSecondOperand ? NumElems : 0; + unsigned Hi = Lo + NumElems; + + for (unsigned i = 0; i < NumElems; ++i) + if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi)) + return false; + + return true; } /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that @@ -3771,16 +4016,12 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) { return true; } -/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that -/// is suitable for input to PALIGNR. -static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, - const X86Subtarget *Subtarget) { - if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || - (VT.is256BitVector() && !Subtarget->hasInt256())) - return false; - +/// \brief Return true if the mask specifies a shuffle of elements that is +/// suitable for input to intralane (palignr) or interlane (valign) vector +/// right-shift. +static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) { unsigned NumElts = VT.getVectorNumElements(); - unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128; + unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; // Do not handle 64-bit element shuffles with palignr. @@ -3844,6 +4085,29 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, return true; } +/// \brief Return true if the node specifies a shuffle of elements that is +/// suitable for input to PALIGNR. 
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT, + const X86Subtarget *Subtarget) { + if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || + (VT.is256BitVector() && !Subtarget->hasInt256()) || + VT.is512BitVector()) + // FIXME: Add AVX512BW. + return false; + + return isAlignrMask(Mask, VT, false); +} + +/// \brief Return true if the node specifies a shuffle of elements that is +/// suitable for input to VALIGN. +static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT, + const X86Subtarget *Subtarget) { + // FIXME: Add AVX512VL. + if (!VT.is512BitVector() || !Subtarget->hasAVX512()) + return false; + return isAlignrMask(Mask, VT, true); +} + /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, @@ -4086,43 +4350,34 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT, assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckl"); - // AVX defines UNPCK* to operate independently on 128-bit lanes. 
- unsigned NumLanes; - unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector()) { - if (NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; - NumLanes = 2; - NumOf256BitLanes = 1; - } else if (VT.is512BitVector()) { - assert(VT.getScalarType().getSizeInBits() >= 32 && - "Unsupported vector type for unpckh"); - NumLanes = 2; - NumOf256BitLanes = 2; - } else { - NumLanes = 1; - NumOf256BitLanes = 1; - } - unsigned NumEltsInStride = NumElts/NumOf256BitLanes; - unsigned NumLaneElts = NumEltsInStride/NumLanes; + assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && + "Unsupported vector type for unpckh"); - for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { - for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { - for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l256*NumEltsInStride+l+i]; - int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; - if (!isUndefOrEqual(BitI, j+l256*NumElts)) - return false; - if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) + // AVX defines UNPCK* to operate independently on 128-bit lanes. 
+ unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (!isUndefOrEqual(BitI1, NumElts)) return false; - if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) + } else { + if (!isUndefOrEqual(BitI1, j + NumElts)) return false; } } } + return true; } @@ -4133,39 +4388,29 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT, assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckh"); - // AVX defines UNPCK* to operate independently on 128-bit lanes. - unsigned NumLanes; - unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); - if (VT.is256BitVector()) { - if (NumElts != 4 && NumElts != 8 && - (!HasInt256 || (NumElts != 16 && NumElts != 32))) + if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && + (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; - NumLanes = 2; - NumOf256BitLanes = 1; - } else if (VT.is512BitVector()) { - assert(VT.getScalarType().getSizeInBits() >= 32 && - "Unsupported vector type for unpckh"); - NumLanes = 2; - NumOf256BitLanes = 2; - } else { - NumLanes = 1; - NumOf256BitLanes = 1; - } - unsigned NumEltsInStride = NumElts/NumOf256BitLanes; - unsigned NumLaneElts = NumEltsInStride/NumLanes; + assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) && + "Unsupported vector type for unpckh"); - for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { - for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { - for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { - int BitI = Mask[l256*NumEltsInStride+l+i]; - int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; - if (!isUndefOrEqual(BitI, j+l256*NumElts)) - return false; - if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) 
+ // AVX defines UNPCK* to operate independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits()/128; + unsigned NumLaneElts = NumElts/NumLanes; + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { + int BitI = Mask[l+i]; + int BitI1 = Mask[l+i+1]; + if (!isUndefOrEqual(BitI, j)) + return false; + if (V2IsSplat) { + if (isUndefOrEqual(BitI1, NumElts)) return false; - if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) + } else { + if (!isUndefOrEqual(BitI1, j+NumElts)) return false; } } @@ -4668,11 +4913,13 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { return Mask; } -/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle -/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. -static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with +/// VALIGN (if Interlane is true) instructions. +static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp, + bool InterLane) { MVT VT = SVOp->getSimpleValueType(0); - unsigned EltSize = VT.is512BitVector() ? 1 : + unsigned EltSize = InterLane ? 1 : VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); @@ -4693,6 +4940,19 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { return (Val - i) * EltSize; } +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the PALIGNR instruction. +static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { + return getShuffleAlignrImmediate(SVOp, false); +} + +/// \brief Return the appropriate immediate to shuffle the specified +/// VECTOR_SHUFFLE mask with the VALIGN instruction. 
+static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) { + return getShuffleAlignrImmediate(SVOp, true); +} + + static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4891,32 +5151,32 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 - SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. 
- SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); + SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 - SDValue Cst = DAG.getTargetConstant(0, MVT::i32); + SDValue Cst = DAG.getConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); - SDValue Cst = DAG.getTargetConstant(0, MVT::i1); + SDValue Cst = DAG.getConstant(0, MVT::i1); SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else @@ -4933,7 +5193,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); - SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); + SDValue Cst = DAG.getConstant(~0U, MVT::i32); SDValue Vec; if (VT.is256BitVector()) { if (HasInt256) { // AVX2 @@ -5103,37 +5363,49 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. -/// Sets IsUnary to true if only uses one source. +/// target specific opcode. Returns true if the Mask could be calculated. Sets +/// IsUnary to true if only uses one source. Note that this will set IsUnary for +/// shuffles which use a single input multiple times, and in those cases it will +/// adjust the mask to only have indices within that single input. 
static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl<int> &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; + bool IsFakeUnary = false; switch(N->getOpcode()) { + case X86ISD::BLENDI: + ImmN = N->getOperand(N->getNumOperands()-1); + DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; @@ -5148,6 +5420,68 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; + case X86ISD::PSHUFB: { + IsUnary = true; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
+ EVT VT = MaskNode.getValueType(); + assert(VT.isVector() && + "Can't produce a non-vector with a build_vector!"); + if (!VT.isInteger()) + return false; + + int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; + + SmallVector<uint64_t, 32> RawMask; + for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) { + RawMask.push_back((uint64_t)SM_SentinelUndef); + continue; + } + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + + // We now have to decode the element which could be any integer size and + // extract each byte of it. + for (int j = 0; j < NumBytesPerElement; ++j) { + // Note that this is x86 and so always little endian: the low byte is + // the first byte of the mask. + RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); + MaskElement = MaskElement.lshr(8); + } + } + DecodePSHUFBMask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodePSHUFBMask(C, Mask); + if (Mask.empty()) + return false; + break; + } + + return false; + } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); @@ -5169,17 +5503,29 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; break; + case X86ISD::MOVSLDUP: + DecodeMOVSLDUPMask(VT, Mask); + break; + case X86ISD::MOVSHDUP: + DecodeMOVSHDUPMask(VT, Mask); + break; case X86ISD::MOVDDUP: case 
X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: - case X86ISD::MOVSHDUP: - case X86ISD::MOVSLDUP: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); } + // If we have a fake unary shuffle, the shuffle mask is spread across two + // inputs that are actually the same node. Re-map the mask to always point + // into the first input. + if (IsFakeUnary) + for (int &M : Mask) + if (M >= (int)Mask.size()) + M -= Mask.size(); + return true; } @@ -5470,76 +5816,112 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, } /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. -static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, - unsigned NonZeros, unsigned NumNonZero, - unsigned NumZero, SelectionDAG &DAG, +static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { - // We know there's at least one non-zero element - unsigned FirstNonZeroIdx = 0; - SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); - while (FirstNonZero.getOpcode() == ISD::UNDEF || - X86::isZeroNode(FirstNonZero)) { - ++FirstNonZeroIdx; - FirstNonZero = Op->getOperand(FirstNonZeroIdx); + // Find all zeroable elements. + bool Zeroable[4]; + for (int i=0; i < 4; ++i) { + SDValue Elt = Op->getOperand(i); + Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); + } + assert(std::count_if(&Zeroable[0], &Zeroable[4], + [](bool M) { return !M; }) > 1 && + "We expect at least two non-zero elements!"); + + // We only know how to deal with build_vector nodes where elements are either + // zeroable or extract_vector_elt with constant index. 
+ SDValue FirstNonZero; + unsigned FirstNonZeroIdx; + for (unsigned i=0; i < 4; ++i) { + if (Zeroable[i]) + continue; + SDValue Elt = Op->getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(Elt.getOperand(1))) + return SDValue(); + // Make sure that this node is extracting from a 128-bit vector. + MVT VT = Elt.getOperand(0).getSimpleValueType(); + if (!VT.is128BitVector()) + return SDValue(); + if (!FirstNonZero.getNode()) { + FirstNonZero = Elt; + FirstNonZeroIdx = i; + } } - if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(FirstNonZero.getOperand(1))) - return SDValue(); + assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); + SDValue V1 = FirstNonZero.getOperand(0); + MVT VT = V1.getSimpleValueType(); - SDValue V = FirstNonZero.getOperand(0); - MVT VVT = V.getSimpleValueType(); - if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) - return SDValue(); + // See if this build_vector can be lowered as a blend with zero. + SDValue Elt; + unsigned EltMaskIdx, EltIdx; + int Mask[4]; + for (EltIdx = 0; EltIdx < 4; ++EltIdx) { + if (Zeroable[EltIdx]) { + // The zero vector will be on the right hand side. + Mask[EltIdx] = EltIdx+4; + continue; + } - unsigned FirstNonZeroDst = - cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue(); - unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; - unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; - unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; + Elt = Op->getOperand(EltIdx); + // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. 
+ EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue(); + if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) + break; + Mask[EltIdx] = EltIdx; + } - for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { - SDValue Elem = Op.getOperand(Idx); - if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) - continue; + if (EltIdx == 4) { + // Let the shuffle legalizer deal with blend operations. + SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); + if (V1.getSimpleValueType() != VT) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); + return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); + } - // TODO: What else can be here? Deal with it. - if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); + // See if we can lower this build_vector to a INSERTPS. + if (!Subtarget->hasSSE41()) + return SDValue(); - // TODO: Some optimizations are still possible here - // ex: Getting one element from a vector, and the rest from another. - if (Elem.getOperand(0) != V) - return SDValue(); + SDValue V2 = Elt.getOperand(0); + if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) + V1 = SDValue(); - unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue(); - if (Dst == Idx) - ++CorrectIdx; - else if (IncorrectIdx == -1U) { - IncorrectIdx = Idx; - IncorrectDst = Dst; - } else - // There was already one element with an incorrect index. - // We can't optimize this case to an insertps. 
- return SDValue(); + bool CanFold = true; + for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { + if (Zeroable[i]) + continue; + + SDValue Current = Op->getOperand(i); + SDValue SrcVector = Current->getOperand(0); + if (!V1.getNode()) + V1 = SrcVector; + CanFold = SrcVector == V1 && + cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i; } - if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { - SDLoc dl(Op); - EVT VT = Op.getSimpleValueType(); - unsigned ElementMoveMask = 0; - if (IncorrectIdx == -1U) - ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; - else - ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; + if (!CanFold) + return SDValue(); - SDValue InsertpsMask = - DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); - } + assert(V1.getNode() && "Expected at least two non-zero elements!"); + if (V1.getSimpleValueType() != MVT::v4f32) + V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); + if (V2.getSimpleValueType() != MVT::v4f32) + V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); - return SDValue(); + // Ok, we can emit an INSERTPS instruction. + unsigned ZMask = 0; + for (int i = 0; i < 4; ++i) + if (Zeroable[i]) + ZMask |= 1 << i; + + unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2, + DAG.getIntPtrConstant(InsertPSMask)); + return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result); } /// getVShift - Return a vector logical shift node. 
@@ -5685,15 +6067,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, SDValue NewLd = SDValue(); - if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) - NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), 0); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), - LDBase->isVolatile(), LDBase->isNonTemporal(), - LDBase->isInvariant(), LDBase->getAlignment()); + LDBase->getPointerInfo(), LDBase->isVolatile(), + LDBase->isNonTemporal(), LDBase->isInvariant(), + LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, @@ -5706,7 +6083,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return NewLd; } - if (NumElems == 4 && LastLoadedElt == 1 && + + //TODO: The code below fires only for for loading the low v2i32 / v2f32 + //of a v4i32 / v4f32. It's probably worth generalizing. + if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; @@ -5742,7 +6122,10 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { - if (!Subtarget->hasFp256()) + // VBROADCAST requires AVX. + // TODO: Splats could be generated for non-AVX CPUs using SSE + // instructions, but there's less potential gain for only 128-bit vectors. 
+ if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); @@ -5819,17 +6202,34 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } } + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); - // Handle the broadcasting a single constant scalar from the constant pool - // into a vector. On Sandybridge it is still better to load a constant vector + // When optimizing for size, generate up to 5 extra bytes for a broadcast + // instruction to save 8 or more bytes of constant pool data. + // TODO: If multiple splats are generated to load the same constant, + // it may be detrimental to overall size. There needs to be a way to detect + // that condition to know if this is truly a size win. + const Function *F = DAG.getMachineFunction().getFunction(); + bool OptForSize = F->getAttributes(). + hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + // Handle broadcasting a single constant scalar from the constant pool + // into a vector. + // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. - if (ConstSplatVal && Subtarget->hasInt256()) { + // But override that restriction when optimizing for size. + // TODO: Check if splatting is recommended for other AVX-capable CPUs. + if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); - unsigned ScalarSize = CVT.getSizeInBits(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { + // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. + // For size optimization, also splat v2f64 and v2i64, and for size opt + // with AVX2, also splat i8 and i16. + // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
+ if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld)) C = CI->getConstantIntValue(); @@ -5850,7 +6250,6 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && @@ -5861,7 +6260,8 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, if (!IsLoad) return SDValue(); - if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (Subtarget->hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match @@ -6017,8 +6417,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { AllContants = false; NonConstIdx = idx; NumNonConsts++; - } - else { + } else { NumConsts++; if (cast<ConstantSDNode>(In)->getZExtValue()) Immediate |= (1ULL << idx); @@ -6041,7 +6440,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT::getIntegerVT(VT.getSizeInBits())); DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); } - else + else DstVec = DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(NonConstIdx), @@ -6064,7 +6463,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. -/// +/// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function checks that the build_vector \p N in input implements a /// horizontal operation. 
Parameter \p Opcode defines the kind of horizontal @@ -6085,7 +6484,7 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); - + bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; @@ -6154,13 +6553,13 @@ static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by -/// a concat_vector. +/// a concat_vector. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two -/// horizontal binary operations. +/// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// @@ -6235,11 +6634,6 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v2f64) && "build_vector with an invalid type found!"); - // Don't try to emit a VSELECT that cannot be lowered into a blend. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) - return SDValue(); - // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from @@ -6251,14 +6645,14 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, for (unsigned i = 0, e = NumElts; i != e; i++) { SDValue Op = BV->getOperand(i); - + // Skip 'undef' values. 
unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } - + // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) return SDValue(); @@ -6312,34 +6706,11 @@ static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG, std::swap(ExpectedOpcode, NextExpectedOpcode); } - // Don't try to fold this build_vector into a VSELECT if it has - // too many UNDEF operands. + // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && - InVec1.getOpcode() != ISD::UNDEF) { - // Emit a sequence of vector add and sub followed by a VSELECT. - // The new VSELECT will be lowered into a BLENDI. - // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI' - // and emit a single ADDSUB instruction. - SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1); - SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1); - - // Construct the VSELECT mask. - EVT MaskVT = VT.changeVectorElementTypeToInteger(); - EVT SVT = MaskVT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - SmallVector<SDValue, 8> Ops; + InVec1.getOpcode() != ISD::UNDEF) + return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); - for (unsigned i = 0, e = NumElts; i != e; ++i) { - APInt Value = i & 1 ? APInt::getNullValue(SVTBits) : - APInt::getAllOnesValue(SVTBits); - SDValue Constant = DAG.getConstant(Value, SVT); - Ops.push_back(Constant); - } - - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops); - return DAG.getSelect(DL, VT, Mask, Sub, Add); - } - return SDValue(); } @@ -6382,18 +6753,18 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Try to match an SSE3 float HADD/HSUB. 
if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); - + if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); - + if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } - + if (!Subtarget->hasAVX()) return SDValue(); @@ -6444,7 +6815,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG, // Do this only if the target has AVX2. if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); - + // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) @@ -6575,6 +6946,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); + + // If using the new shuffle lowering, just directly insert this. + if (ExperimentalVectorShuffleLowering) + return DAG.getNode( + ISD::BITCAST, dl, VT, + getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG)); + Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); // Now we have our 32-bit value zero extended in the low element of @@ -6648,6 +7026,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); + // If using the new shuffle lowering, just directly insert this. 
+ if (ExperimentalVectorShuffleLowering) + return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); + // Turn it into a shuffle of zero and zero-extended scalar to vector. Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG); SmallVector<int, 8> MaskVec; @@ -6677,13 +7059,18 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); - // For AVX-length vectors, build the individual 128-bit pieces and use + // For AVX-length vectors, see if we can use a vector load to get all of the + // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { SmallVector<SDValue, 64> V; for (unsigned i = 0; i != NumElems; ++i) V.push_back(Op.getOperand(i)); + // Check for a build vector of consecutive loads. + if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) + return LD; + EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. @@ -6725,8 +7112,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) { - SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero, - NumZero, DAG, Subtarget, *this); + SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this); if (V.getNode()) return V; } @@ -6917,6 +7303,89 @@ static bool isSingleInputShuffleMask(ArrayRef<int> Mask) { return true; } +/// \brief Test whether there are elements crossing 128-bit lanes in this +/// shuffle mask. +/// +/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations +/// and we routinely test for these. 
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + return true; + return false; +} + +/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. +/// +/// This checks a shuffle mask to see if it is performing the same +/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies +/// that it is also not lane-crossing. It may however involve a blend from the +/// same lane of a second vector. +/// +/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is +/// non-trivial to compute in the face of undef lanes. The representation is +/// *not* suitable for use with existing 128-bit shuffles as it will contain +/// entries from both V1 and V2 inputs to the wider mask. +static bool +is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &RepeatedMask) { + int LaneSize = 128 / VT.getScalarSizeInBits(); + RepeatedMask.resize(LaneSize, -1); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + if ((Mask[i] % Size) / LaneSize != i / LaneSize) + // This entry crosses lanes, so there is no way to model this shuffle. + return false; + + // Ok, handle the in-lane shuffles by detecting if and when they repeat. + if (RepeatedMask[i % LaneSize] == -1) + // This is the first non-undef entry in this slot of a 128-bit lane. + RepeatedMask[i % LaneSize] = + Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; + else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) + // Found a mismatch with the repeated mask. + return false; + } + return true; +} + +// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC +// 2013 will allow us to use it as a non-type template parameter. 
+namespace { + +/// \brief Implementation of the \c isShuffleEquivalent variadic functor. +/// +/// See its documentation for details. +bool isShuffleEquivalentImpl(ArrayRef<int> Mask, ArrayRef<const int *> Args) { + if (Mask.size() != Args.size()) + return false; + for (int i = 0, e = Mask.size(); i < e; ++i) { + assert(*Args[i] >= 0 && "Arguments must be positive integers!"); + if (Mask[i] != -1 && Mask[i] != *Args[i]) + return false; + } + return true; +} + +} // namespace + +/// \brief Checks whether a shuffle mask is equivalent to an explicit list of +/// arguments. +/// +/// This is a fast way to test a shuffle mask against a fixed pattern: +/// +/// if (isShuffleEquivalent(Mask, 3, 2, 1, 0)) { ... } +/// +/// It returns true if the mask is exactly as wide as the argument list, and +/// each element of the mask is either -1 (signifying undef) or the value given +/// in the argument. +static const VariadicFunction1< + bool, ArrayRef<int>, int, isShuffleEquivalentImpl> isShuffleEquivalent = {}; + /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to @@ -6941,6 +7410,835 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, return DAG.getConstant(Imm, MVT::i8); } +/// \brief Try to emit a blend instruction for a shuffle. +/// +/// This doesn't do any checks for the availability of instructions for blending +/// these values. It relies on the availability of the X86ISD::BLENDI pattern to +/// be matched in the backend with the type given. What it does check for is +/// that the shuffle mask is in fact a blend. +static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + + unsigned BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] >= Size) { + if (Mask[i] != i + Size) + return SDValue(); // Shuffled V2 input! 
+ BlendMask |= 1u << i; + continue; + } + if (Mask[i] >= 0 && Mask[i] != i) + return SDValue(); // Shuffled V1 input! + } + switch (VT.SimpleTy) { + case MVT::v2f64: + case MVT::v4f32: + case MVT::v4f64: + case MVT::v8f32: + return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + + case MVT::v4i64: + case MVT::v8i32: + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // FALLTHROUGH + case MVT::v2i64: + case MVT::v4i32: + // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into + // that instruction. + if (Subtarget->hasAVX2()) { + // Scale the blend by the number of 32-bit dwords per element. + int Scale = VT.getScalarSizeInBits() / 32; + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; + V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + // FALLTHROUGH + case MVT::v8i16: { + // For integer shuffles we need to expand the mask and cast the inputs to + // v8i16s prior to blending. 
+ int Scale = 8 / VT.getVectorNumElements(); + BlendMask = 0; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= Size) + for (int j = 0; j < Scale; ++j) + BlendMask |= 1u << (i * Scale + j); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8))); + } + + case MVT::v16i16: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + SmallVector<int, 8> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { + // We can lower these with PBLENDW which is mirrored across 128-bit lanes. + assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); + BlendMask = 0; + for (int i = 0; i < 8; ++i) + if (RepeatedMask[i] >= 16) + BlendMask |= 1u << i; + return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, + DAG.getConstant(BlendMask, MVT::i8)); + } + } + // FALLTHROUGH + case MVT::v32i8: { + assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); + // Scale the blend by the number of bytes per element. + int Scale = VT.getScalarSizeInBits() / 8; + assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!"); + + // Compute the VSELECT mask. Note that VSELECT is really confusing in the + // mix of LLVM's code generator and the x86 backend. We tell the code + // generator that boolean values in the elements of an x86 vector register + // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' + // mapping a select to operand #1, and 'false' mapping to operand #2. The + // reality in x86 is that vector masks (pre-AVX-512) use only the high bit + // of the element (the remaining are ignored) and 0 in that high bit would + // mean operand #1 while 1 in the high bit would mean operand #2. 
So while + // the LLVM model for boolean values in vector elements gets the relevant + // bit set, it is set backwards and over constrained relative to x86's + // actual model. + SDValue VSELECTMask[32]; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + for (int j = 0; j < Scale; ++j) + VSELECTMask[Scale * i + j] = + Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8); + + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2); + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(ISD::VSELECT, DL, MVT::v32i8, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, VSELECTMask), + V1, V2)); + } + + default: + llvm_unreachable("Not a supported integer vector type!"); + } +} + +/// \brief Generic routine to lower a shuffle and blend as a decomposed set of +/// unblended shuffles followed by an unshuffled blend. +/// +/// This matches the extremely common pattern for handling combined +/// shuffle+blend operations on newer X86 ISAs where we have very fast blend +/// operations. +static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, + SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // Shuffle the input elements into the desired positions in V1 and V2 and + // blend them together. + SmallVector<int, 32> V1Mask(Mask.size(), -1); + SmallVector<int, 32> V2Mask(Mask.size(), -1); + SmallVector<int, 32> BlendMask(Mask.size(), -1); + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] < Size) { + V1Mask[i] = Mask[i]; + BlendMask[i] = i; + } else if (Mask[i] >= Size) { + V2Mask[i] = Mask[i] - Size; + BlendMask[i] = i + Size; + } + + V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); + return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); +} + +/// \brief Try to lower a vector shuffle as a byte rotation. 
+/// +/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary +/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use +/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will +/// try to generically lower a vector shuffle through such an pattern. It +/// does not check for the profitability of lowering either as PALIGNR or +/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. +/// This matches shuffle vectors that look like: +/// +/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] +/// +/// Essentially it concatenates V1 and V2, shifts right by some number of +/// elements, and takes the low elements as the result. Note that while this is +/// specified as a *right shift* because x86 is little-endian, it is a *left +/// rotate* of the vector lanes. +/// +/// Note that this only handles 128-bit vector widths currently. +static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + // We need to detect various ways of spelling a rotation: + // [11, 12, 13, 14, 15, 0, 1, 2] + // [-1, 12, 13, 14, -1, -1, 1, -1] + // [-1, -1, -1, -1, -1, -1, 1, 2] + // [ 3, 4, 5, 6, 7, 8, 9, 10] + // [-1, 4, 5, 6, -1, -1, 9, -1] + // [-1, 4, 5, 6, -1, -1, -1, -1] + int Rotation = 0; + SDValue Lo, Hi; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Mask[i] == -1) + continue; + assert(Mask[i] >= 0 && "Only -1 is a valid negative mask element!"); + + // Based on the mod-Size value of this mask element determine where + // a rotated vector would have started. + int StartIdx = i - (Mask[i] % Size); + if (StartIdx == 0) + // The identity rotation isn't interesting, stop. + return SDValue(); + + // If we found the tail of a vector the rotation must be the missing + // front. 
If we found the head of a vector, it must be how much of the head. + int CandidateRotation = StartIdx < 0 ? -StartIdx : Size - StartIdx; + + if (Rotation == 0) + Rotation = CandidateRotation; + else if (Rotation != CandidateRotation) + // The rotations don't match, so we can't match this mask. + return SDValue(); + + // Compute which value this mask is pointing at. + SDValue MaskV = Mask[i] < Size ? V1 : V2; + + // Compute which of the two target values this index should be assigned to. + // This reflects whether the high elements are remaining or the low elements + // are remaining. + SDValue &TargetV = StartIdx < 0 ? Hi : Lo; + + // Either set up this value if we've not encountered it before, or check + // that it remains consistent. + if (!TargetV) + TargetV = MaskV; + else if (TargetV != MaskV) + // This may be a rotation, but it pulls from the inputs in some + // unsupported interleaving. + return SDValue(); + } + + // Check that we successfully analyzed the mask, and normalize the results. + assert(Rotation != 0 && "Failed to locate a viable rotation!"); + assert((Lo || Hi) && "Failed to find a rotated input vector!"); + if (!Lo) + Lo = Hi; + else if (!Hi) + Hi = Lo; + + assert(VT.getSizeInBits() == 128 && + "Rotate-based lowering only supports 128-bit lowering!"); + assert(Mask.size() <= 16 && + "Can shuffle at most 16 bytes in a 128-bit vector!"); + + // The actual rotate instruction rotates bytes, so we need to scale the + // rotation based on how many bytes are in the vector. + int Scale = 16 / Mask.size(); + + // SSSE3 targets can use the palignr instruction + if (Subtarget->hasSSSE3()) { + // Cast the inputs to v16i8 to match PALIGNR. 
+ Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Hi); + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PALIGNR, DL, MVT::v16i8, Hi, Lo, + DAG.getConstant(Rotation * Scale, MVT::i8))); + } + + // Default SSE2 implementation + int LoByteShift = 16 - Rotation * Scale; + int HiByteShift = Rotation * Scale; + + // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. + Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo); + Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi); + + SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, + DAG.getConstant(8 * LoByteShift, MVT::i8)); + SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, + DAG.getConstant(8 * HiByteShift, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); +} + +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node, dig out the input value and + // use it. + SDValue V = M < Size ? 
V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros). +/// +/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2 +/// byte-shift instructions. The mask must consist of a shifted sequential +/// shuffle from one of the input vectors and zeroable elements for the +/// remaining 'shifted in' elements. +/// +/// Note that this only handles 128-bit vector widths currently. +static SDValue lowerVectorShuffleAsByteShift(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Size = Mask.size(); + int Scale = 16 / Size; + + for (int Shift = 1; Shift < Size; Shift++) { + int ByteShift = Shift * Scale; + + // PSRLDQ : (little-endian) right byte shift + // [ 5, 6, 7, zz, zz, zz, zz, zz] + // [ -1, 5, 6, 7, zz, zz, zz, zz] + // [ 1, 2, -1, -1, -1, -1, zz, zz] + bool ZeroableRight = true; + for (int i = Size - Shift; i < Size; i++) { + ZeroableRight &= Zeroable[i]; + } + + if (ZeroableRight) { + bool ValidShiftRight1 = + isSequentialOrUndefInRange(Mask, 0, Size - Shift, Shift); + bool ValidShiftRight2 = + isSequentialOrUndefInRange(Mask, 0, Size - Shift, Size + Shift); + + if (ValidShiftRight1 || ValidShiftRight2) { + // Cast the inputs to v2i64 to match PSRLDQ. + SDValue &TargetV = ValidShiftRight1 ? 
V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, V, + DAG.getConstant(ByteShift * 8, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + + // PSLLDQ : (little-endian) left byte shift + // [ zz, 0, 1, 2, 3, 4, 5, 6] + // [ zz, zz, -1, -1, 2, 3, 4, -1] + // [ zz, zz, zz, zz, zz, zz, -1, 1] + bool ZeroableLeft = true; + for (int i = 0; i < Shift; i++) { + ZeroableLeft &= Zeroable[i]; + } + + if (ZeroableLeft) { + bool ValidShiftLeft1 = + isSequentialOrUndefInRange(Mask, Shift, Size - Shift, 0); + bool ValidShiftLeft2 = + isSequentialOrUndefInRange(Mask, Shift, Size - Shift, Size); + + if (ValidShiftLeft1 || ValidShiftLeft2) { + // Cast the inputs to v2i64 to match PSLLDQ. + SDValue &TargetV = ValidShiftLeft1 ? V1 : V2; + SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TargetV); + SDValue Shifted = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, V, + DAG.getConstant(ByteShift * 8, MVT::i8)); + return DAG.getNode(ISD::BITCAST, DL, VT, Shifted); + } + } + } + + return SDValue(); +} + +/// \brief Lower a vector shuffle as a zero or any extension. +/// +/// Given a specific number of elements, element bit width, and extension +/// stride, produce either a zero or any extension based on the available +/// features of the subtarget. +static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( + SDLoc DL, MVT VT, int NumElements, int Scale, bool AnyExt, SDValue InputV, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(Scale > 1 && "Need a scale to extend."); + int EltBits = VT.getSizeInBits() / NumElements; + assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && + "Only 8, 16, and 32 bit elements can be extended."); + assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); + + // Found a valid zext mask! Try various lowering strategies based on the + // input type and available ISA extensions. 
+ if (Subtarget->hasSSE41()) { + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), + NumElements / Scale); + InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); + } + + // For any extends we can cheat for larger element sizes and use shuffle + // instructions that can fold with a load and/or copy. + if (AnyExt && EltBits == 32) { + int PSHUFDMask[4] = {0, -1, 1, -1}; + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + if (AnyExt && EltBits == 16 && Scale > 2) { + int PSHUFDMask[4] = {0, -1, 0, -1}; + InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)); + int PSHUFHWMask[4] = {1, -1, -1, -1}; + return DAG.getNode( + ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, InputV), + getV4X86ShuffleImm8ForMask(PSHUFHWMask, DAG))); + } + + // If this would require more than 2 unpack instructions to expand, use + // pshufb when available. We can only use more than 2 unpack instructions + // when zero extending i8 elements which also makes it easier to use pshufb. + if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { + assert(NumElements == 16 && "Unexpected byte vector width!"); + SDValue PSHUFBMask[16]; + for (int i = 0; i < 16; ++i) + PSHUFBMask[i] = + DAG.getConstant((i % Scale == 0) ? 
i / Scale : 0x80, MVT::i8); + InputV = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, InputV); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v16i8, PSHUFBMask))); + } + + // Otherwise emit a sequence of unpacks. + do { + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); + SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) + : getZeroVector(InputVT, Subtarget, DAG, DL); + InputV = DAG.getNode(ISD::BITCAST, DL, InputVT, InputV); + InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); + Scale /= 2; + EltBits *= 2; + NumElements /= 2; + } while (Scale > 1); + return DAG.getNode(ISD::BITCAST, DL, VT, InputV); +} + +/// \brief Try to lower a vector shuffle as a zero extension on any micrarch. +/// +/// This routine will try to do everything in its power to cleverly lower +/// a shuffle which happens to match the pattern of a zero extend. It doesn't +/// check for the profitability of this lowering, it tries to aggressively +/// match this pattern. It will use all of the micro-architectural details it +/// can to emit an efficient lowering. It handles both blends with all-zero +/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to +/// masking out later). +/// +/// The reason we have dedicated lowering for zext-style shuffles is that they +/// are both incredibly common and often quite performance sensitive. +static SDValue lowerVectorShuffleAsZeroOrAnyExtend( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + int Bits = VT.getSizeInBits(); + int NumElements = Mask.size(); + + // Define a helper function to check a particular ext-scale and lower to it if + // valid. 
+ auto Lower = [&](int Scale) -> SDValue { + SDValue InputV; + bool AnyExt = true; + for (int i = 0; i < NumElements; ++i) { + if (Mask[i] == -1) + continue; // Valid anywhere but doesn't tell us anything. + if (i % Scale != 0) { + // Each of the extend elements needs to be zeroable. + if (!Zeroable[i]) + return SDValue(); + + // We no lorger are in the anyext case. + AnyExt = false; + continue; + } + + // Each of the base elements needs to be consecutive indices into the + // same input vector. + SDValue V = Mask[i] < NumElements ? V1 : V2; + if (!InputV) + InputV = V; + else if (InputV != V) + return SDValue(); // Flip-flopping inputs. + + if (Mask[i] % NumElements != i / Scale) + return SDValue(); // Non-consecutive strided elemenst. + } + + // If we fail to find an input, we have a zero-shuffle which should always + // have already been handled. + // FIXME: Maybe handle this here in case during blending we end up with one? + if (!InputV) + return SDValue(); + + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( + DL, VT, NumElements, Scale, AnyExt, InputV, Subtarget, DAG); + }; + + // The widest scale possible for extending is to a 64-bit integer. + assert(Bits % 64 == 0 && + "The number of bits in a vector must be divisible by 64 on x86!"); + int NumExtElements = Bits / 64; + + // Each iteration, try extending the elements half as much, but into twice as + // many elements. + for (; NumExtElements < NumElements; NumExtElements *= 2) { + assert(NumElements % NumExtElements == 0 && + "The input vector size must be divisble by the extended size."); + if (SDValue V = Lower(NumElements / NumExtElements)) + return V; + } + + // No viable ext lowering found. + return SDValue(); +} + +/// \brief Try to get a scalar value for a specific element of a vector. +/// +/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. 
+static SDValue getScalarValueForVectorElement(SDValue V, int Idx, + SelectionDAG &DAG) { + MVT VT = V.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + // If the bitcasts shift the element size, we can't extract an equivalent + // element from it. + MVT NewVT = V.getSimpleValueType(); + if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) + return SDValue(); + + if (V.getOpcode() == ISD::BUILD_VECTOR || + (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) + return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, V.getOperand(Idx)); + + return SDValue(); +} + +/// \brief Helper to test for a load that can be folded with x86 shuffles. +/// +/// This is particularly important because the set of instructions varies +/// significantly based on whether the operand is a load or not. +static bool isShuffleFoldableLoad(SDValue V) { + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + return ISD::isNON_EXTLoad(V.getNode()); +} + +/// \brief Try to lower insertion of a single element into a zero vector. +/// +/// This is a common pattern that we have especially efficient patterns to lower +/// across all subtarget feature sets. +static SDValue lowerVectorShuffleAsElementInsertion( + MVT VT, SDLoc DL, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + MVT ExtVT = VT; + MVT EltVT = VT.getVectorElementType(); + + int V2Index = std::find_if(Mask.begin(), Mask.end(), + [&Mask](int M) { return M >= (int)Mask.size(); }) - + Mask.begin(); + bool IsV1Zeroable = true; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (i != V2Index && !Zeroable[i]) { + IsV1Zeroable = false; + break; + } + + // Check for a single input from a SCALAR_TO_VECTOR node. 
+ // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and + // all the smarts here sunk into that routine. However, the current + // lowering of BUILD_VECTOR makes that nearly impossible until the old + // vector shuffle lowering is dead. + if (SDValue V2S = getScalarValueForVectorElement( + V2, Mask[V2Index] - Mask.size(), DAG)) { + // We need to zext the scalar if it is smaller than an i32. + V2S = DAG.getNode(ISD::BITCAST, DL, EltVT, V2S); + if (EltVT == MVT::i8 || EltVT == MVT::i16) { + // Using zext to expand a narrow element won't work for non-zero + // insertions. + if (!IsV1Zeroable) + return SDValue(); + + // Zero-extend directly to i32. + ExtVT = MVT::v4i32; + V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); + } + V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); + } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || + EltVT == MVT::i16) { + // Either not inserting from the low element of the input or the input + // element size is too small to use VZEXT_MOVL to clear the high bits. + return SDValue(); + } + + if (!IsV1Zeroable) { + // If V1 can't be treated as a zero vector we have fewer options to lower + // this. We can't support integer vectors or non-zero targets cheaply, and + // the V1 elements can't be permuted in any way. + assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); + if (!VT.isFloatingPoint() || V2Index != 0) + return SDValue(); + SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end()); + V1Mask[V2Index] = -1; + if (!isNoopShuffleMask(V1Mask)) + return SDValue(); + // This is essentially a special case blend operation, but if we have + // general purpose blend operations, they are always faster. Bail and let + // the rest of the lowering handle these as blends. + if (Subtarget->hasSSE41()) + return SDValue(); + + // Otherwise, use MOVSD or MOVSS. 
+ assert((EltVT == MVT::f32 || EltVT == MVT::f64) && + "Only two types of floating point element types to handle!"); + return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, + ExtVT, V1, V2); + } + + V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); + if (ExtVT != VT) + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + + if (V2Index != 0) { + // If we have 4 or fewer lanes we can cheaply shuffle the element into + // the desired position. Otherwise it is more efficient to do a vector + // shift left. We know that we can do a vector shift left because all + // the inputs are zero. + if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { + SmallVector<int, 4> V2Shuffle(Mask.size(), 1); + V2Shuffle[V2Index] = 0; + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); + } else { + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, V2); + V2 = DAG.getNode( + X86ISD::VSHLDQ, DL, MVT::v2i64, V2, + DAG.getConstant( + V2Index * EltVT.getSizeInBits(), + DAG.getTargetLoweringInfo().getScalarShiftAmountTy(MVT::v2i64))); + V2 = DAG.getNode(ISD::BITCAST, DL, VT, V2); + } + } + return V2; +} + +/// \brief Try to lower broadcast of a single element. +/// +/// For convenience, this code also bundles all of the subtarget feature set +/// filtering. While a little annoying to re-dispatch on type here, there isn't +/// a convenient way to factor it out. +static SDValue lowerVectorShuffleAsBroadcast(MVT VT, SDLoc DL, SDValue V, + ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (!Subtarget->hasAVX()) + return SDValue(); + if (VT.isInteger() && !Subtarget->hasAVX2()) + return SDValue(); + + // Check that the mask is a broadcast. 
+ int BroadcastIdx = -1; + for (int M : Mask) + if (M >= 0 && BroadcastIdx == -1) + BroadcastIdx = M; + else if (M >= 0 && M != BroadcastIdx) + return SDValue(); + + assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " + "a sorted mask where the broadcast " + "comes from V1."); + + // Go up the chain of (vector) values to try and find a scalar load that + // we can combine with the broadcast. + for (;;) { + switch (V.getOpcode()) { + case ISD::CONCAT_VECTORS: { + int OperandSize = Mask.size() / V.getNumOperands(); + V = V.getOperand(BroadcastIdx / OperandSize); + BroadcastIdx %= OperandSize; + continue; + } + + case ISD::INSERT_SUBVECTOR: { + SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); + auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); + if (!ConstantIdx) + break; + + int BeginIdx = (int)ConstantIdx->getZExtValue(); + int EndIdx = + BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { + BroadcastIdx -= BeginIdx; + V = VInner; + } else { + V = VOuter; + } + continue; + } + } + break; + } + + // Check if this is a broadcast of a scalar. We special case lowering + // for scalars so that we can more effectively fold with loads. + if (V.getOpcode() == ISD::BUILD_VECTOR || + (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { + V = V.getOperand(BroadcastIdx); + + // If the scalar isn't a load we can't broadcast from it in AVX1, only with + // AVX2. + if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) + return SDValue(); + } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { + // We can't broadcast from a vector register w/o AVX2, and we can only + // broadcast from the zero-element of a vector register. + return SDValue(); + } + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); +} + +// Check for whether we can use INSERTPS to perform the shuffle. 
We only use +// INSERTPS when the V1 elements are already in the correct locations +// because otherwise we can just always use two SHUFPS instructions which +// are much smaller to encode than a SHUFPS and an INSERTPS. We can also +// perform INSERTPS if a single V1 element is out of place and all V2 +// elements are zeroable. +static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + unsigned ZMask = 0; + int V1DstIndex = -1; + int V2DstIndex = -1; + bool V1UsedInPlace = false; + + for (int i = 0; i < 4; i++) { + // Synthesize a zero mask from the zeroable elements (includes undefs). + if (Zeroable[i]) { + ZMask |= 1 << i; + continue; + } + + // Flag if we use any V1 inputs in place. + if (i == Mask[i]) { + V1UsedInPlace = true; + continue; + } + + // We can only insert a single non-zeroable element. + if (V1DstIndex != -1 || V2DstIndex != -1) + return SDValue(); + + if (Mask[i] < 4) { + // V1 input out of place for insertion. + V1DstIndex = i; + } else { + // V2 input for insertion. + V2DstIndex = i; + } + } + + // Don't bother if we have no (non-zeroable) element for insertion. + if (V1DstIndex == -1 && V2DstIndex == -1) + return SDValue(); + + // Determine element insertion src/dst indices. The src index is from the + // start of the inserted vector, not the start of the concatenated vector. + unsigned V2SrcIndex = 0; + if (V1DstIndex != -1) { + // If we have a V1 input out of place, we use V1 as the V2 element insertion + // and don't use the original V2 at all. 
+ V2SrcIndex = Mask[V1DstIndex]; + V2DstIndex = V1DstIndex; + V2 = V1; + } else { + V2SrcIndex = Mask[V2DstIndex] - 4; + } + + // If no V1 inputs are used in place, then the result is created only from + // the zero mask and the V2 insertion - so remove V1 dependency. + if (!V1UsedInPlace) + V1 = DAG.getUNDEF(MVT::v4f32); + + unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; + assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); + + // Insert the V2 element into the desired position. + SDLoc DL(Op); + return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, + DAG.getConstant(InsertPSMask, MVT::i8)); +} + /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full @@ -6963,12 +8261,56 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + + // If we have a single input, insert that into V1 if we can do so cheaply. 
+ if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2f64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Try to use one of the special instruction patterns to handle two common + // blend patterns if a zero-blend above didn't work. + if (isShuffleEquivalent(Mask, 0, 3) || isShuffleEquivalent(Mask, 1, 3)) + if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) + // We can either use a special instruction to load over the low double or + // to move just the low double. + return DAG.getNode( + isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, + DL, MVT::v2f64, V2, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); @@ -6992,6 +8334,11 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v2i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. 
// We have to map the mask as it is actually a v4i32 shuffle instruction. @@ -7005,6 +8352,44 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v2i64, V1, V2, Mask, DAG)) + return Shift; + + // If we have a single input from V2 insert that into V1 if we can do so + // cheaply. + if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + // Try inverting the insertion since for v2 masks it is easy to do and we + // can't reliably sort the mask one way or the other. + int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), + Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v2i64, DL, V2, V1, InverseMask, Subtarget, DAG)) + return Insertion; + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 2)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 3)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte rotation instructions. + // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. 
// However, all the alternatives are still more cycles and newer chips don't @@ -7015,38 +8400,25 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } -/// \brief Lower 4-lane 32-bit floating point shuffles. +/// \brief Lower a vector shuffle using the SHUFPS instruction. /// -/// Uses instructions exclusively from the floating point unit to minimize -/// domain crossing penalties, as these are sufficient to implement all v4f32 -/// shuffles. -static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - SDLoc DL(Op); - assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); - assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); - ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); - ArrayRef<int> Mask = SVOp->getMask(); - assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - +/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. +/// It makes no assumptions about whether this is the *best* lowering, it simply +/// uses it. +static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); - if (NumV2Elements == 0) - // Straight shuffle of a single input vector. We pass the input vector to - // both operands to simulate this with a SHUFPS. 
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, - getV4X86ShuffleImm8ForMask(Mask, DAG)); - if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); + // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; @@ -7063,7 +8435,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; - V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, + V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now proceed to reconstruct the final blend as we have the necessary @@ -7080,9 +8452,17 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the - // high lanes. We never see this reversed because we sort the shuffle. + // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; + } else if (Mask[2] < 4 && Mask[3] < 4) { + // We also handle the reversed case because this utility may get called + // when we detect a SHUFPS pattern but can't easily commute the shuffle to + // arrange things in the right direction. + NewMask[0] -= 4; + NewMask[1] -= 4; + HighV = V1; + LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final @@ -7094,7 +8474,7 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? 
Mask[2] : Mask[3]) - 4}; - V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, + V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to @@ -7106,10 +8486,78 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, NewMask[3] = Mask[2] < 4 ? 3 : 1; } } - return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, + return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } +/// \brief Lower 4-lane 32-bit floating point shuffles. +/// +/// Uses instructions exclusively from the floating point unit to minimize +/// domain crossing penalties, as these are sufficient to implement all v4f32 +/// shuffles. +static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); + assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (Subtarget->hasAVX()) { + // If we have AVX, we can use VPERMILPS which will allow folding a load + // into the shuffle. + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Otherwise, use a straight shuffle of a single input vector. 
We pass the + // input vector to both operands to simulate this with a SHUFPS. + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); + + // There are special ways we can lower some single-element blends. However, we + // have custom ways we can lower more complex single-element blends below that + // we defer to if both this and BLENDPS fail to match, so restrict this to + // when the V2 input is targeting element 0 of the mask -- that is the fast + // case here. + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4f32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + if (Subtarget->hasSSE41()) { + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use INSERTPS if we can complete the shuffle efficiently. + if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) + return V; + } + + // Otherwise fall back to a SHUFPS lowering strategy. + return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); +} + /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for @@ -7125,11 +8573,66 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); - if (isSingleInputShuffleMask(Mask)) + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. It also allows us to fold memory operands into the + // shuffle in many cases. 
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return ZExt; + + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. + // We coerce the shuffle pattern to be compatible with UNPCK instructions + // but we aren't actually going to use the UNPCK instruction because doing + // so prevents folding a load into this instruction or making a copy. + const int UnpackLoMask[] = {0, 0, 1, 1}; + const int UnpackHiMask[] = {2, 2, 3, 3}; + if (isShuffleEquivalent(Mask, 0, 0, 1, 1)) + Mask = UnpackLoMask; + else if (isShuffleEquivalent(Mask, 2, 2, 3, 3)) + Mask = UnpackHiMask; + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); + } + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v4i32, V1, V2, Mask, DAG)) + return Shift; + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v4i32, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 4, 1, 5)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 6, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte rotation instructions. 
+ // Its more profitable for pre-SSSE3 to use shuffles/unpacks. + if (Subtarget->hasSSSE3()) + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) + return Rotate; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build @@ -7182,6 +8685,27 @@ static SDValue lowerV8I16SingleInputVectorShuffle( MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i16, DL, V, + Mask, Subtarget, DAG)) + return Broadcast; + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V, V, Mask, DAG)) + return Shift; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 0, 1, 1, 2, 2, 3, 3)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V, V); + if (isShuffleEquivalent(Mask, 4, 4, 5, 5, 6, 6, 7, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V, V); + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V, V, Mask, Subtarget, DAG)) + return Rotate; + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through @@ -7190,22 +8714,126 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // - // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 - // and 2-2. 
- auto balanceSides = [&](ArrayRef<int> ThreeInputs, int OneInput, - int ThreeInputHalfSum, int OneInputHalfOffset) { + // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half + // and an existing 2-into-2 on the other half. In this case we may have to + // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or + // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. + // Fortunately, we don't have to handle anything but a 2-into-2 pattern + // because any other situation (including a 3-into-1 or 1-into-3 in the other + // half than the one we target for fixing) will be fixed when we re-enter this + // path. We will also combine away any sequence of PSHUFD instructions that + // result into a single instruction. Here is an example of the tricky case: + // + // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] + // + // This now has a 1-into-3 in the high half! Instead, we do two shuffles: + // + // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] + // + // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] + // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] + // + // The result is fine to be handled by the generic logic. 
+ auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, + ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, + int AOffset, int BOffset) { + assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && + "Must call this with A having 3 or 1 inputs from the A half."); + assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && + "Must call this with B having 1 or 3 inputs from the B half."); + assert(AToAInputs.size() + BToAInputs.size() == 4 && + "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. - int DWordA = (ThreeInputHalfSum - - std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / - 2; - int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; + int ADWord, BDWord; + int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; + int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; + int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; + ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; + int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; + int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); + int TripleNonInputIdx = + TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); + TripleDWord = TripleNonInputIdx / 2; + + // We use xor with one to compute the adjacent DWord to whichever one the + // OneInput is in. + OneInputDWord = (OneInput / 2) ^ 1; + + // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA + // and BToA inputs. If there is also such a problem with the BToB and AToB + // inputs, we don't try to fix it necessarily -- we'll recurse and see it in + // the next pass. 
However, if we have a 2<-2 in the BToB and AToB inputs, it + // is essential that we don't *create* a 3<-1 as then we might oscillate. + if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { + // Compute how many inputs will be flipped by swapping these DWords. We + // need + // to balance this to ensure we don't form a 3-1 shuffle in the other + // half. + int NumFlippedAToBInputs = + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); + int NumFlippedBToBInputs = + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + if ((NumFlippedAToBInputs == 1 && + (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || + (NumFlippedBToBInputs == 1 && + (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { + // We choose whether to fix the A half or B half based on whether that + // half has zero flipped inputs. At zero, we may not be able to fix it + // with that half. We also bias towards fixing the B half because that + // will more commonly be the high half, and we have to bias one way. + auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, + ArrayRef<int> Inputs) { + int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. + bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), + PinnedIdx ^ 1) != Inputs.end(); + // Determine whether the free index is in the flipped dword or the + // unflipped dword based on where the pinned index is. We use this bit + // in an xor to conditionally select the adjacent dword. 
+ int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); + bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + if (IsFixIdxInput == IsFixFreeIdxInput) + FixFreeIdx += 1; + IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), + FixFreeIdx) != Inputs.end(); + assert(IsFixIdxInput != IsFixFreeIdxInput && + "We need to be changing the number of flipped inputs!"); + int PSHUFHalfMask[] = {0, 1, 2, 3}; + std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); + V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, + MVT::v8i16, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DAG)); + + for (int &M : Mask) + if (M != -1 && M == FixIdx) + M = FixFreeIdx; + else if (M != -1 && M == FixFreeIdx) + M = FixIdx; + }; + if (NumFlippedBToBInputs != 0) { + int BPinnedIdx = + BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); + } else { + assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); + int APinnedIdx = + AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); + } + } + } int PSHUFDMask[] = {0, 1, 2, 3}; - PSHUFDMask[DWordA] = DWordB; - PSHUFDMask[DWordB] = DWordA; + PSHUFDMask[ADWord] = BDWord; + PSHUFDMask[BDWord] = ADWord; V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), @@ -7213,24 +8841,20 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // Adjust the mask to match the new locations of A and B. for (int &M : Mask) - if (M != -1 && M/2 == DWordA) - M = 2 * DWordB + M % 2; - else if (M != -1 && M/2 == DWordB) - M = 2 * DWordA + M % 2; + if (M != -1 && M/2 == ADWord) + M = 2 * BDWord + M % 2; + else if (M != -1 && M/2 == BDWord) + M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. 
return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), Mask); }; - if (NumLToL == 3 && NumHToL == 1) - return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); - else if (NumLToL == 1 && NumHToL == 3) - return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); - else if (NumLToH == 1 && NumHToH == 3) - return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); - else if (NumLToH == 3 && NumHToH == 1) - return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); + if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) + return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); + else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) + return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and @@ -7244,9 +8868,10 @@ static SDValue lowerV8I16SingleInputVectorShuffle( // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. - auto fixInPlaceInputs = [&PSHUFDMask]( - ArrayRef<int> InPlaceInputs, MutableArrayRef<int> SourceHalfMask, - MutableArrayRef<int> HalfMask, int HalfOffset) { + auto fixInPlaceInputs = + [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, + MutableArrayRef<int> SourceHalfMask, + MutableArrayRef<int> HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { @@ -7255,6 +8880,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } + if (IncomingInputs.empty()) { + // Just fix all of the in place inputs. 
+ for (int Input : InPlaceInputs) { + SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; + PSHUFDMask[Input / 2] = Input / 2; + } + return; + } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = @@ -7266,10 +8899,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; - if (!HToLInputs.empty()) - fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); - if (!LToHInputs.empty()) - fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); + fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); + fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. @@ -7278,7 +8909,8 @@ static SDValue lowerV8I16SingleInputVectorShuffle( auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, - int SourceOffset, int DestOffset) { + MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, + int DestOffset) { auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; @@ -7304,7 +8936,7 @@ static SDValue lowerV8I16SingleInputVectorShuffle( Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. 
for (int &M : HalfMask) - if (M == SourceHalfMask[Input - SourceOffset]) + if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; @@ -7356,18 +8988,68 @@ static SDValue lowerV8I16SingleInputVectorShuffle( } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { - int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; - assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && - "Not all dwords can be clobbered!"); - SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; - SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; + // We have two non-adjacent or clobbered inputs we need to extract from + // the source half. To do this, we need to map them into some adjacent + // dword slot in the source mask. + int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, + IncomingInputs[1] - SourceOffset}; + + // If there is a free slot in the source half mask adjacent to one of + // the inputs, place the other input in it. We use (Index XOR 1) to + // compute an adjacent index. + if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && + SourceHalfMask[InputsFixed[0] ^ 1] == -1) { + SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + InputsFixed[1] = InputsFixed[0] ^ 1; + } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && + SourceHalfMask[InputsFixed[1] ^ 1] == -1) { + SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; + InputsFixed[0] = InputsFixed[1] ^ 1; + } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { + // The two inputs are in the same DWord but it is clobbered and the + // adjacent DWord isn't used at all. 
Move both inputs to the free + // slot. + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; + SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; + InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); + InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; + } else { + // The only way we hit this point is if there is no clobbering + // (because there are no off-half inputs to this half) and there is no + // free slot adjacent to one of the inputs. In this case, we have to + // swap an input with a non-input. + for (int i = 0; i < 4; ++i) + assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && + "We can't handle any clobbers here!"); + assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && + "Cannot have adjacent inputs here!"); + + SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; + SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; + + // We also have to update the final source mask in this case because + // it may need to undo the above swap. + for (int &M : FinalSourceHalfMask) + if (M == (InputsFixed[0] ^ 1) + SourceOffset) + M = InputsFixed[1] + SourceOffset; + else if (M == InputsFixed[1] + SourceOffset) + M = (InputsFixed[0] ^ 1) + SourceOffset; + + InputsFixed[1] = InputsFixed[0] ^ 1; + } + + // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) - M = SourceDWordBase + SourceOffset; + M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) - M = SourceDWordBase + 1 + SourceOffset; - IncomingInputs[0] = SourceDWordBase + SourceOffset; - IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; + M = InputsFixed[1] + SourceOffset; + + IncomingInputs[0] = InputsFixed[0] + SourceOffset; + IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); @@ -7377,13 +9059,14 @@ static SDValue lowerV8I16SingleInputVectorShuffle( int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 
0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; - for (int Input : IncomingInputs) - std::replace(HalfMask.begin(), HalfMask.end(), Input, - FreeDWord * 2 + Input % 2); + for (int &M : HalfMask) + for (int Input : IncomingInputs) + if (M == Input) + M = FreeDWord * 2 + Input % 2; }; - moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, + moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); - moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, + moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their @@ -7520,34 +9203,37 @@ static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, if (GoodInputs.size() == 2) { // If the low inputs are spread across two dwords, pack them into // a single dword. - MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] = - Mask[GoodInputs[0]] - MaskOffset; - MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] = - Mask[GoodInputs[1]] - MaskOffset; - Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; - Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; + MoveMask[MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; + MoveMask[MoveOffset + 1] = Mask[GoodInputs[1]] - MaskOffset; + Mask[GoodInputs[0]] = MoveOffset + MaskOffset; + Mask[GoodInputs[1]] = MoveOffset + 1 + MaskOffset; } else { - // Otherwise pin the low inputs. + // Otherwise pin the good inputs. 
for (int GoodInput : GoodInputs) MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; } - int MoveMaskIdx = - std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - - std::begin(MoveMask); - assert(MoveMaskIdx >= MoveOffset && "Established above"); - if (BadInputs.size() == 2) { + // If we have two bad inputs then there may be either one or two good + // inputs fixed in place. Find a fixed input, and then find the *other* + // two adjacent indices by using modular arithmetic. + int GoodMaskIdx = + std::find_if(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), + [](int M) { return M >= 0; }) - + std::begin(MoveMask); + int MoveMaskIdx = + ((((GoodMaskIdx - MoveOffset) & ~1) + 2) % 4) + MoveOffset; assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); - MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = - Mask[BadInputs[0]] - MaskOffset; - MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = - Mask[BadInputs[1]] - MaskOffset; - Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; - Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; + MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; + MoveMask[MoveMaskIdx + 1] = Mask[BadInputs[1]] - MaskOffset; + Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; + Mask[BadInputs[1]] = MoveMaskIdx + 1 + MaskOffset; } else { assert(BadInputs.size() == 1 && "All sizes handled"); + int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, + std::end(MoveMask), -1) - + std::begin(MoveMask); MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; } @@ -7603,6 +9289,12 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + // Whenever we can lower this as a zext, that instruction is strictly faster + // than any alternative. 
+ if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v8i16, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + auto isV1 = [](int M) { return M >= 0 && M < 8; }; auto isV2 = [](int M) { return M >= 8; }; @@ -7615,6 +9307,33 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v8i16, V1, V2, Mask, DAG)) + return Shift; + + // There are special ways we can lower some single-element blends. + if (NumV2Inputs == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v8i16, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); + if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + + if (Subtarget->hasSSE41()) + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) + return Rotate; + if (NumV1Inputs + NumV2Inputs <= 4) return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); @@ -7658,6 +9377,74 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); } +/// \brief Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. 
Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask) { + // Figure out whether we're looping over two inputs or just one. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] == -1) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. 
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to @@ -7675,6 +9462,22 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> OrigMask = SVOp->getMask(); assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Try to use byte shift instructions. + if (SDValue Shift = lowerVectorShuffleAsByteShift( + DL, MVT::v16i8, V1, V2, OrigMask, DAG)) + return Shift; + + // Try to use byte rotation instructions. + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( + DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + return Rotate; + + // Try to use a zext lowering. + if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( + DL, MVT::v16i8, V1, V2, OrigMask, Subtarget, DAG)) + return ZExt; + int MaskStorage[16] = { OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], @@ -7684,8 +9487,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MutableArrayRef<int> LoMask = Mask.slice(0, 8); MutableArrayRef<int> HiMask = Mask.slice(8, 8); + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 16; }); + // For single-input shuffles, there are some nicer lowering tricks we can use. - if (isSingleInputShuffleMask(Mask)) { + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies @@ -7695,10 +9506,10 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef<int> Mask) { - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != Mask[i + 1]) + for (int i = 0; i < 16; i += 2) + if (Mask[i] != -1 && Mask[i + 1] != -1 && Mask[i] != Mask[i + 1]) return false; - } + return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { @@ -7759,11 +9570,16 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; - for (int i = 0; i < 16; i += 2) { - if (Mask[i] != -1) - PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); - assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); - } + for (int i = 0; i < 16; ++i) + if (Mask[i] != -1) { + int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); + assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); + if (PostDupI16Shuffle[i / 2] == -1) + PostDupI16Shuffle[i / 2] = MappedMask; + else + assert(PostDupI16Shuffle[i / 2] == MappedMask && + "Conflicting entrties in the original shuffle!"); + } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, @@ -7780,21 +9596,127 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // // FIXME: We need to handle other interleaving widths (i16, i32, ...). if (shouldLowerAsInterleaving(Mask)) { - // FIXME: Figure out whether we should pack these into the low or high - // halves. 
- - int EMask[16], OMask[16]; + int NumLoHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 0 && M < 8) || (M >= 16 && M < 24); + }); + int NumHiHalf = std::count_if(Mask.begin(), Mask.end(), [](int M) { + return (M >= 8 && M < 16) || M >= 24; + }); + int EMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + int OMask[16] = {-1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1}; + bool UnpackLo = NumLoHalf >= NumHiHalf; + MutableArrayRef<int> TargetEMask(UnpackLo ? EMask : EMask + 8, 8); + MutableArrayRef<int> TargetOMask(UnpackLo ? OMask : OMask + 8, 8); for (int i = 0; i < 8; ++i) { - EMask[i] = Mask[2*i]; - OMask[i] = Mask[2*i + 1]; - EMask[i + 8] = -1; - OMask[i + 8] = -1; + TargetEMask[i] = Mask[2 * i]; + TargetOMask[i] = Mask[2 * i + 1]; } SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); + return DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, + MVT::v16i8, Evens, Odds); + } + + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly + // with PSHUFB. It is important to do this before we attempt to generate any + // blends but after all of the single-input lowerings. If the single input + // lowerings can find an instruction sequence that is faster than a PSHUFB, we + // want to preserve that and we can DAG combine any longer sequences into + // a PSHUFB in the end. But once we start blending from multiple inputs, + // the complexity of DAG combining bad patterns back into PSHUFB is too high, + // and there are *very* few patterns that would actually be faster than the + // PSHUFB approach because of its ability to zero lanes. + // + // FIXME: The only exceptions to the above are blends which are exact + // interleavings with direct instructions supporting them. We currently don't + // handle those well here. 
+ if (Subtarget->hasSSSE3()) { + SDValue V1Mask[16]; + SDValue V2Mask[16]; + bool V1InUse = false; + bool V2InUse = false; + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); + } else { + const int ZeroMask = 0x80; + int V1Idx = (Mask[i] < 16 ? Mask[i] : ZeroMask); + int V2Idx = (Mask[i] < 16 ? ZeroMask : Mask[i] - 16); + if (Zeroable[i]) + V1Idx = V2Idx = ZeroMask; + V1Mask[i] = DAG.getConstant(V1Idx, MVT::i8); + V2Mask[i] = DAG.getConstant(V2Idx, MVT::i8); + V1InUse |= (ZeroMask != V1Idx); + V2InUse |= (ZeroMask != V2Idx); + } + } + + if (V1InUse) + V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); + if (V2InUse) + V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, V2, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); + + // If we need shuffled inputs from both, blend the two. + if (V1InUse && V2InUse) + return DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); + if (V1InUse) + return V1; // Single inputs are easy. + if (V2InUse) + return V2; // Single inputs are easy. + // Shuffling to a zeroable vector. + return getZeroVector(MVT::v16i8, Subtarget, DAG, DL); + } + + // There are special ways we can lower some single-element blends. + if (NumV2Elements == 1) + if (SDValue V = lowerVectorShuffleAsElementInsertion(MVT::v16i8, DL, V1, V2, + Mask, Subtarget, DAG)) + return V; + + // Check whether a compaction lowering can be done. This handles shuffles + // which take every Nth element for some even N. See the helper function for + // details. + // + // We special case these as they can be particularly efficiently handled with + // the PACKUSB instruction on x86 and they show up in common patterns of + // rearranging bytes to truncate wide elements. + if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask)) { + // NumEvenDrops is the power of two stride of the elements. 
Another way of + // thinking about it is that we need to drop the even elements this many + // times to get the original input. + bool IsSingleInput = isSingleInputShuffleMask(Mask); + + // First we need to zero all the dropped bytes. + assert(NumEvenDrops <= 3 && + "No support for dropping even elements more than 3 times."); + // We use the mask type to pick which bytes are preserved based on how many + // elements are dropped. + MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; + SDValue ByteClearMask = + DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, + DAG.getConstant(0xFF, MaskVTs[NumEvenDrops - 1])); + V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + if (!IsSingleInput) + V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + + // Now pack things back together. + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1); + V2 = IsSingleInput ? V1 : DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + for (int i = 1; i < NumEvenDrops; ++i) { + Result = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); + } + + return Result; } int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; @@ -7893,15 +9815,1109 @@ static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -/// \brief Tiny helper function to test whether adjacent masks are sequential. -static bool areAdjacentMasksSequential(ArrayRef<int> Mask) { - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - if (Mask[i] + 1 != Mask[i+1]) +/// \brief Helper function to test whether a shuffle mask could be +/// simplified by widening the elements being shuffled. +/// +/// Appends the mask for wider elements in WidenedMask if valid. Otherwise +/// leaves it in an unspecified state. +/// +/// NOTE: This must handle normal vector shuffle masks and *target* vector +/// shuffle masks. 
The latter have the special property of a '-2' representing +/// a zero-ed lane of a vector. +static bool canWidenShuffleElements(ArrayRef<int> Mask, + SmallVectorImpl<int> &WidenedMask) { + for (int i = 0, Size = Mask.size(); i < Size; i += 2) { + // If both elements are undef, its trivial. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) { + WidenedMask.push_back(SM_SentinelUndef); + continue; + } + + // Check for an undef mask and a mask value properly aligned to fit with + // a pair of values. If we find such a case, use the non-undef mask's value. + if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 && Mask[i + 1] % 2 == 1) { + WidenedMask.push_back(Mask[i + 1] / 2); + continue; + } + if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // When zeroing, we need to spread the zeroing across both lanes to widen. + if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) { + if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) && + (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) { + WidenedMask.push_back(SM_SentinelZero); + continue; + } + return false; + } + + // Finally check if the two mask values are adjacent and aligned with + // a pair. + if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 && Mask[i] + 1 == Mask[i + 1]) { + WidenedMask.push_back(Mask[i] / 2); + continue; + } + + // Otherwise we can't safely widen the elements used in this shuffle. + return false; + } + assert(WidenedMask.size() == Mask.size() / 2 && + "Incorrect size of mask after widening the elements!"); + + return true; +} + +/// \brief Generic routine to split ector shuffle into half-sized shuffles. +/// +/// This routine just extracts two subvectors, shuffles them independently, and +/// then concatenates them back together. This should work effectively with all +/// AVX vector shuffle types. 
+static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() >= 256 && + "Only for 256-bit or wider vector shuffles!"); + assert(V1.getSimpleValueType() == VT && "Bad operand type!"); + assert(V2.getSimpleValueType() == VT && "Bad operand type!"); + + ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); + ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); + + int NumElements = VT.getVectorNumElements(); + int SplitNumElements = NumElements / 2; + MVT ScalarVT = VT.getScalarType(); + MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + + SDValue LoV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V1, + DAG.getIntPtrConstant(SplitNumElements)); + SDValue LoV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, + DAG.getIntPtrConstant(0)); + SDValue HiV2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, V2, + DAG.getIntPtrConstant(SplitNumElements)); + + // Now create two 4-way blends of these half-width vectors. 
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) { + bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; + SmallVector<int, 32> V1BlendMask, V2BlendMask, BlendMask; + for (int i = 0; i < SplitNumElements; ++i) { + int M = HalfMask[i]; + if (M >= NumElements) { + if (M >= NumElements + SplitNumElements) + UseHiV2 = true; + else + UseLoV2 = true; + V2BlendMask.push_back(M - NumElements); + V1BlendMask.push_back(-1); + BlendMask.push_back(SplitNumElements + i); + } else if (M >= 0) { + if (M >= SplitNumElements) + UseHiV1 = true; + else + UseLoV1 = true; + V2BlendMask.push_back(-1); + V1BlendMask.push_back(M); + BlendMask.push_back(i); + } else { + V2BlendMask.push_back(-1); + V1BlendMask.push_back(-1); + BlendMask.push_back(-1); + } + } + + // Because the lowering happens after all combining takes place, we need to + // manually combine these blend masks as much as possible so that we create + // a minimal number of high-level vector shuffle nodes. + + // First try just blending the halves of V1 or V2. + if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) + return DAG.getUNDEF(SplitVT); + if (!UseLoV2 && !UseHiV2) + return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + if (!UseLoV1 && !UseHiV1) + return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + + SDValue V1Blend, V2Blend; + if (UseLoV1 && UseHiV1) { + V1Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); + } else { + // We only use half of V1 so map the usage down into the final blend mask. + V1Blend = UseLoV1 ? LoV1 : HiV1; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) + BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); + } + if (UseLoV2 && UseHiV2) { + V2Blend = + DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); + } else { + // We only use half of V2 so map the usage down into the final blend mask. + V2Blend = UseLoV2 ? 
LoV2 : HiV2; + for (int i = 0; i < SplitNumElements; ++i) + if (BlendMask[i] >= SplitNumElements) + BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); + } + return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); + }; + SDValue Lo = HalfBlend(LoMask); + SDValue Hi = HalfBlend(HiMask); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); +} + +/// \brief Either split a vector in halves or decompose the shuffles and the +/// blend. +/// +/// This is provided as a good fallback for many lowerings of non-single-input +/// shuffles with more than one 128-bit lane. In those cases, we want to select +/// between splitting the shuffle into 128-bit components and stitching those +/// back together vs. extracting the single-input shuffles and blending those +/// results. +static SDValue lowerVectorShuffleAsSplitOrBlend(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && "This routine must not be used to " + "lower single-input shuffles as it " + "could then recurse on itself."); + int Size = Mask.size(); + + // If this can be modeled as a broadcast of two elements followed by a blend, + // prefer that lowering. This is especially important because broadcasts can + // often fold with memory operands. + auto DoBothBroadcast = [&] { + int V1BroadcastIdx = -1, V2BroadcastIdx = -1; + for (int M : Mask) + if (M >= Size) { + if (V2BroadcastIdx == -1) + V2BroadcastIdx = M - Size; + else if (M - Size != V2BroadcastIdx) + return false; + } else if (M >= 0) { + if (V1BroadcastIdx == -1) + V1BroadcastIdx = M; + else if (M != V1BroadcastIdx) + return false; + } + return true; + }; + if (DoBothBroadcast()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, + DAG); + + // If the inputs all stem from a single 128-bit lane of each input, then we + // split them rather than blending because the split will decompose to + // unusually few instructions. 
+ int LaneCount = VT.getSizeInBits() / 128; + int LaneSize = Size / LaneCount; + SmallBitVector LaneInputs[2]; + LaneInputs[0].resize(LaneCount, false); + LaneInputs[1].resize(LaneCount, false); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; + if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + // Otherwise, just fall back to decomposed shuffles and a blend. This requires + // that the decomposed single-input shuffles don't end up here. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as +/// a permutation and blend of those lanes. +/// +/// This essentially blends the out-of-lane inputs to each lane into the lane +/// from a permuted copy of the vector. This lowering strategy results in four +/// instructions in the worst case for a single-input cross lane shuffle which +/// is lower than any other fully general cross-lane shuffle strategy I'm aware +/// of. Special cases for each particular shuffle pattern should be handled +/// prior to trying this lowering. +static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { + // FIXME: This should probably be generalized for 512-bit vectors as well. + assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + int LaneSize = Mask.size() / 2; + + // If there are only inputs from one 128-bit lane, splitting will in fact be + // less expensive. The flags track wether the given lane contains an element + // that crosses to another lane. 
+ bool LaneCrossing[2] = {false, false}; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + if (isSingleInputShuffleMask(Mask)) { + SmallVector<int, 32> FlippedBlendMask; + for (int i = 0, Size = Mask.size(); i < Size; ++i) + FlippedBlendMask.push_back( + Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) + ? Mask[i] + : Mask[i] % LaneSize + + (i / LaneSize) * LaneSize + Size)); + + // Flip the vector, and blend the results which should now be in-lane. The + // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and + // 5 for the high source. The value 3 selects the high half of source 2 and + // the value 2 selects the low half of source 2. We only use source 2 to + // allow folding it into a memory operand. + unsigned PERMMask = 3 | 2 << 4; + SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), + V1, DAG.getConstant(PERMMask, MVT::i8)); + return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); + } + + // This now reduces to two single-input shuffles of V1 and V2 which at worst + // will be handled by the above logic and a blend of the results, much like + // other patterns in AVX. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering 2-lane 128-bit shuffles. +static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + // Blends are faster and handle all the non-lane-crossing cases. 
+ if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + // Check for patterns which can be matched with a single insert of a 128-bit + // subvector. + if (isShuffleEquivalent(Mask, 0, 1, 0, 1) || + isShuffleEquivalent(Mask, 0, 1, 4, 5)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, + Mask[2] < 4 ? V1 : V2, DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + if (isShuffleEquivalent(Mask, 0, 1, 6, 7)) { + SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, + DAG.getIntPtrConstant(0)); + SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, + DAG.getIntPtrConstant(2)); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); + } + + // Otherwise form a 128-bit permutation. + // FIXME: Detect zero-vector inputs and use the VPERM2X128 to zero that half. + unsigned PermMask = Mask[0] / 2 | (Mask[2] / 2) << 4; + return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, + DAG.getConstant(PermMask, MVT::i8)); +} + +/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then +/// shuffling each lane. +/// +/// This will only succeed when the result of fixing the 128-bit lanes results +/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in +/// each 128-bit lanes. This handles many cases where we can quickly blend away +/// the lane crosses early and then use simpler shuffles within each lane. +/// +/// FIXME: It might be worthwhile at some point to support this without +/// requiring the 128-bit lane-relative shuffles to be repeating, but currently +/// in x86 only floating point has interesting non-repeating shuffles, and even +/// those are still *marginally* more expensive. 
+static SDValue lowerVectorShuffleByMerging128BitLanes( + SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { + assert(!isSingleInputShuffleMask(Mask) && + "This is only useful with multiple inputs."); + + int Size = Mask.size(); + int LaneSize = 128 / VT.getScalarSizeInBits(); + int NumLanes = Size / LaneSize; + assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); + + // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also + // check whether the in-128-bit lane shuffles share a repeating pattern. + SmallVector<int, 4> Lanes; + Lanes.resize(NumLanes, -1); + SmallVector<int, 4> InLaneMask; + InLaneMask.resize(LaneSize, -1); + for (int i = 0; i < Size; ++i) { + if (Mask[i] < 0) + continue; + + int j = i / LaneSize; + + if (Lanes[j] < 0) { + // First entry we've seen for this lane. + Lanes[j] = Mask[i] / LaneSize; + } else if (Lanes[j] != Mask[i] / LaneSize) { + // This doesn't match the lane selected previously! + return SDValue(); + } + + // Check that within each lane we have a consistent shuffle mask. + int k = i % LaneSize; + if (InLaneMask[k] < 0) { + InLaneMask[k] = Mask[i] % LaneSize; + } else if (InLaneMask[k] != Mask[i] % LaneSize) { + // This doesn't fit a repeating in-lane mask. + return SDValue(); + } + } + + // First shuffle the lanes into place. + MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64, + VT.getSizeInBits() / 64); + SmallVector<int, 8> LaneMask; + LaneMask.resize(NumLanes * 2, -1); + for (int i = 0; i < NumLanes; ++i) + if (Lanes[i] >= 0) { + LaneMask[2 * i + 0] = 2*Lanes[i] + 0; + LaneMask[2 * i + 1] = 2*Lanes[i] + 1; + } + + V1 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, LaneVT, V2); + SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask); + + // Cast it back to the type we actually want. 
+ LaneShuffle = DAG.getNode(ISD::BITCAST, DL, VT, LaneShuffle); + + // Now do a simple shuffle that isn't lane crossing. + SmallVector<int, 8> NewMask; + NewMask.resize(Size, -1); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize; + assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) && + "Must not introduce lane crosses at this point!"); + + return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); +} + +/// \brief Test whether the specified input (0 or 1) is in-place blended by the +/// given mask. +/// +/// This returns true if the elements from a particular input are already in the +/// slot required by the given mask and require no permutation. +static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) { + assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); + int Size = Mask.size(); + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } +/// \brief Handle lowering of 4-lane 64-bit floating point shuffles. +/// +/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Subtarget, + DAG); + + if (isSingleInputShuffleMask(Mask)) { + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4f64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { + // Non-half-crossing single input shuffles can be lowerid with an + // interleaved permutation. + unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, + DAG.getConstant(VPERMILPMask, MVT::i8)); + } + + // With AVX2 we have direct support for this permutation. + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, + DAG); + } + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); + + // If we have a single input to the zero element, insert that into V1 if we + // can do so cheaply. + int NumV2Elements = + std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); + if (NumV2Elements == 1 && Mask[0] >= 4) + if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( + MVT::v4f64, DL, V1, V2, Mask, Subtarget, DAG)) + return Insertion; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check if the blend happens to exactly fit that of SHUFPD. 
+ if ((Mask[0] == -1 || Mask[0] < 2) && + (Mask[1] == -1 || (Mask[1] >= 4 && Mask[1] < 6)) && + (Mask[2] == -1 || (Mask[2] >= 2 && Mask[2] < 4)) && + (Mask[3] == -1 || Mask[3] >= 6)) { + unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 5) << 1) | + ((Mask[2] == 3) << 2) | ((Mask[3] == 7) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V1, V2, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + if ((Mask[0] == -1 || (Mask[0] >= 4 && Mask[0] < 6)) && + (Mask[1] == -1 || Mask[1] < 2) && + (Mask[2] == -1 || Mask[2] >= 6) && + (Mask[3] == -1 || (Mask[3] >= 2 && Mask[3] < 4))) { + unsigned SHUFPDMask = (Mask[0] == 5) | ((Mask[1] == 1) << 1) | + ((Mask[2] == 7) << 2) | ((Mask[3] == 3) << 3); + return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f64, V2, V1, + DAG.getConstant(SHUFPDMask, MVT::i8)); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because an v4 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 4-lane 64-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v4i64 shuffling.. 
+static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v4i64 with AVX2!"); + + SmallVector<int, 4> WidenedMask; + if (canWidenShuffleElements(Mask, WidenedMask)) + return lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Subtarget, + DAG); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v4i64, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // When the shuffle is mirrored between the 128-bit lanes of the unit, we can + // use lower latency instructions that will operate on both 128-bit lanes. + SmallVector<int, 2> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { + if (isSingleInputShuffleMask(Mask)) { + int PSHUFDMask[] = {-1, -1, -1, -1}; + for (int i = 0; i < 2; ++i) + if (RepeatedMask[i] >= 0) { + PSHUFDMask[2 * i] = 2 * RepeatedMask[i]; + PSHUFDMask[2 * i + 1] = 2 * RepeatedMask[i] + 1; + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v4i64, + DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, V1), + getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); + } + + // Use dedicated unpack instructions for masks that match their pattern. 
+ if (isShuffleEquivalent(Mask, 0, 4, 2, 6)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 5, 3, 7)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); + } + + // AVX2 provides a direct instruction for permuting a single input across + // lanes. + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, + getV4X86ShuffleImm8ForMask(Mask, DAG)); + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. However, if we have AVX2 and either inputs are already in place, + // we will be able to shuffle even across lanes the other input in a single + // instruction so skip this pattern. + if (!(Subtarget->hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || + isShuffleMaskInputInPlace(1, Mask)))) + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit floating point shuffles. +/// +/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 +/// isn't available. +static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. 
+ if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane, we have many more + // options to efficiently lower the shuffle. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && + "Repeated masks must be half the mask width!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); + + // Otherwise, fall back to a SHUFPS sequence. Here it is important that we + // have already handled any direct blends. We also need to squash the + // repeated mask into a simulated v4f32 mask. + for (int i = 0; i < 4; ++i) + if (RepeatedMask[i] >= 8) + RepeatedMask[i] -= 4; + return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); + } + + // If we have a single input shuffle with different shuffle patterns in the + // two 128-bit lanes use the variable mask to VPERMILPS. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? 
DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + return DAG.getNode( + X86ISD::VPERMILPV, DL, MVT::v8f32, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask)); + + if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, + DAG.getNode(ISD::BITCAST, DL, MVT::v8f32, + DAG.getNode(ISD::BUILD_VECTOR, DL, + MVT::v8i32, VPermMask)), + V1); + + // Otherwise, fall back. + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, + DAG); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // If we have AVX2 then we always want to lower with a blend because at v8 we + // can fully permute the elements. + if (Subtarget->hasAVX2()) + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, + Mask, DAG); + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 32-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v8i32 shuffling.. 
+static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v8i32 with AVX2!"); + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8i32, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // If the shuffle mask is repeated in each 128-bit lane we can use more + // efficient instructions that mirror the shuffles across the two 128-bit + // lanes. + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) { + assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, + getV4X86ShuffleImm8ForMask(RepeatedMask, DAG)); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 4, 12, 5, 13)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); + if (isShuffleEquivalent(Mask, 2, 10, 3, 11, 6, 14, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); + } + + // If the shuffle patterns aren't repeated but it is a single input, directly + // generate a cross-lane VPERMD instruction. + if (isSingleInputShuffleMask(Mask)) { + SDValue VPermMask[8]; + for (int i = 0; i < 8; ++i) + VPermMask[i] = Mask[i] < 0 ? 
DAG.getUNDEF(MVT::i32) + : DAG.getConstant(Mask[i], MVT::i32); + return DAG.getNode( + X86ISD::VPERMV, DL, MVT::v8i32, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic blend lowering. + return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, + Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 16-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v16i16 shuffling.. +static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v16i16, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. 
+ if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 0, 16, 1, 17, 2, 18, 3, 19, + // Second 128-bit lane: + 8, 24, 9, 25, 10, 26, 11, 27)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); + if (isShuffleEquivalent(Mask, + // First 128-bit lane: + 4, 20, 5, 21, 6, 22, 7, 23, + // Second 128-bit lane: + 12, 28, 13, 29, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i16 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, + Mask, DAG); + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 16; ++i) { + if (Mask[i] == -1) { + PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8); + continue; + } + + int M = i < 8 ? Mask[i] : Mask[i] - 8; + assert(M >= 0 && M < 8 && "Invalid single-input mask!"); + PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8); + PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8); + } + return DAG.getNode( + ISD::BITCAST, DL, MVT::v16i16, + DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, + DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1), + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask))); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 8-bit integer shuffles. +/// +/// This routine is only called when we have AVX2 and thus a reasonable +/// instruction set for v32i8 shuffling.. 
+static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasAVX2() && "We can only lower v32i8 with AVX2!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v32i8, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, + Subtarget, DAG)) + return Blend; + + // Use dedicated unpack instructions for masks that match their pattern. + // Note that these are repeated 128-bit lane unpacks, not unpacks across all + // 256-bit lanes. + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + // Second 128-bit lane: + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); + if (isShuffleEquivalent( + Mask, + // First 128-bit lane: + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + // Second 128-bit lane: + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + + if (isSingleInputShuffleMask(Mask)) { + // There are no generalized cross-lane shuffle operations available on i8 + // element types. + if (is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) + return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, + Mask, DAG); + + SDValue PSHUFBMask[32]; + for (int i = 0; i < 32; ++i) + PSHUFBMask[i] = + Mask[i] < 0 + ? DAG.getUNDEF(MVT::i8) + : DAG.getConstant(Mask[i] < 16 ? 
Mask[i] : Mask[i] - 16, MVT::i8); + + return DAG.getNode( + X86ISD::PSHUFB, DL, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)); + } + + // Try to simplify this by merging 128-bit lanes to enable a lane-based + // shuffle. + if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( + DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) + return Result; + + // Otherwise fall back on generic lowering. + return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 256-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 256-bit x86 vector +/// shuffle or splits it into two 128-bit shuffles and fuses the results back +/// together based on the available instructions. +static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we can + // check for those subtargets here and avoid much of the subtarget querying in + // the per-vector-type lowering routines. With AVX1 we have essentially *zero* + // ability to manipulate a 256-bit vector with integer types. Since we'll use + // floating point types there eventually, just immediately cast everything to + // a float and operate entirely in that domain. + if (VT.isInteger() && !Subtarget->hasAVX2()) { + int ElementBits = VT.getScalarSizeInBits(); + if (ElementBits < 32) + // No floating point type available, decompose into 128-bit vectors. 
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + + MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), + VT.getVectorNumElements()); + V1 = DAG.getNode(ISD::BITCAST, DL, FpVT, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, FpVT, V2); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); + } + + switch (VT.SimpleTy) { + case MVT::v4f64: + return lowerV4F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v4i64: + return lowerV4I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8f32: + return lowerV8F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i32: + return lowerV8I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i16: + return lowerV16I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i8: + return lowerV32I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + + default: + llvm_unreachable("Not a valid 256-bit x86 vector type!"); + } +} + +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + + // FIXME: Implement direct support for this type! 
+ return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + 0, 16, 1, 17, 4, 20, 5, 21, + 8, 24, 9, 25, 12, 28, 13, 29)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); + if (isShuffleEquivalent(Mask, + 2, 18, 3, 19, 6, 22, 7, 23, + 10, 26, 11, 27, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. 
+ if (isShuffleEquivalent(Mask, 0, 8, 2, 10, 4, 12, 6, 14)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); + if (isShuffleEquivalent(Mask, 1, 9, 3, 11, 5, 13, 7, 15)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 16-lane 32-bit integer shuffles. +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); + + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(Mask, + 0, 16, 1, 17, 4, 20, 5, 21, + 8, 24, 9, 25, 12, 28, 13, 29)) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); + if (isShuffleEquivalent(Mask, + 2, 18, 3, 19, 6, 22, 7, 23, + 10, 26, 11, 27, 14, 30, 15, 31)) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 32-lane 16-bit integer shuffles. 
+static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); +} + +/// \brief Handle lowering of 64-lane 8-bit integer shuffles. +static SDValue lowerV64I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); + assert(Subtarget->hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); + + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); +} + +/// \brief High-level routine to lower various 512-bit x86 vector shuffles. +/// +/// This routine either breaks down the specific type of a 512-bit x86 vector +/// shuffle or splits it into two 256-bit shuffles and fuses the results back +/// together based on the available instructions. 
+static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/ basic ISA!"); + + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(VT.SimpleTy, DL, V1, + Mask, Subtarget, DAG)) + return Broadcast; + + // Dispatch to each element type for lowering. If we don't have support for + // specific element type shuffles at 512 bits, immediately split them and + // lower them. Each lowering routine of a given type is allowed to assume that + // the requisite ISA extensions for that element type are available. + switch (VT.SimpleTy) { + case MVT::v8f64: + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v16i32: + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v32i16: + if (Subtarget->hasBWI()) + return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + case MVT::v64i8: + if (Subtarget->hasBWI()) + return lowerV64I8VectorShuffle(Op, V1, V2, Subtarget, DAG); + break; + + default: + llvm_unreachable("Not a valid 512-bit x86 vector type!"); + } + + // Otherwise fall back on splitting. + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); +} + /// \brief Top-level lowering for x86 vector shuffles. 
/// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -7945,22 +10961,25 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } - // For integer vector shuffles, try to collapse them into a shuffle of fewer - // lanes but wider integers. We cap this to not form integers larger than i64 - // but it might be interesting to form i128 integers to handle flipping the - // low and high halves of AVX 256-bit vectors. - if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && - areAdjacentMasksSequential(Mask)) { - SmallVector<int, 8> NewMask; - for (int i = 0, Size = Mask.size(); i < Size; i += 2) - NewMask.push_back(Mask[i] / 2); - MVT NewVT = - MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), - VT.getVectorNumElements() / 2); - V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); - V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); + // Try to collapse shuffles into using a vector type with fewer elements but + // wider element types. We cap this to not form integers or floating point + // elements wider than 64 bits, but it might be interesting to form i128 + // integers to handle flipping the low and high halves of AVX 256-bit vectors. + SmallVector<int, 16> WidenedMask; + if (VT.getScalarSizeInBits() < 64 && + canWidenShuffleElements(Mask, WidenedMask)) { + MVT NewEltVT = VT.isFloatingPoint() + ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) + : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); + MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); + // Make sure that the new vector type is legal. For example, v2f64 isn't + // legal on SSE1. 
+ if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { + V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); + V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getVectorShuffle(NewVT, dl, V1, V2, WidenedMask)); + } } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; @@ -7979,7 +10998,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, return DAG.getCommutedVectorShuffle(*SVOp); // When the number of V1 and V2 elements are the same, try to minimize the - // number of uses of V2 in the low half of the vector. + // number of uses of V2 in the low half of the vector. When that is tied, + // ensure that the sum of indices for V1 is equal to or lower than the sum + // indices for V2. When those are equal, try to ensure that the number of odd + // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) @@ -7987,14 +11009,42 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, ++LowV2Elements; else if (M >= 0) ++LowV1Elements; - if (LowV2Elements > LowV1Elements) + if (LowV2Elements > LowV1Elements) { return DAG.getCommutedVectorShuffle(*SVOp); + } else if (LowV2Elements == LowV1Elements) { + int SumV1Indices = 0, SumV2Indices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + SumV2Indices += i; + else if (SVOp->getMask()[i] >= 0) + SumV1Indices += i; + if (SumV2Indices < SumV1Indices) { + return DAG.getCommutedVectorShuffle(*SVOp); + } else if (SumV2Indices == SumV1Indices) { + int NumV1OddIndices = 0, NumV2OddIndices = 0; + for (int i = 0, Size = SVOp->getMask().size(); i < Size; ++i) + if (SVOp->getMask()[i] >= NumElements) + NumV2OddIndices += i % 2; + else if (SVOp->getMask()[i] >= 0) + NumV1OddIndices += i % 2; + if (NumV2OddIndices < NumV1OddIndices) + 
return DAG.getCommutedVectorShuffle(*SVOp); + } + } } // For each vector width, delegate to a specialized lowering routine. if (VT.getSizeInBits() == 128) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (VT.getSizeInBits() == 256) + return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + + // Force AVX-512 vectors to be scalarized for now. + // FIXME: Implement AVX-512 support! + if (VT.getSizeInBits() == 512) + return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + llvm_unreachable("Unimplemented!"); } @@ -9060,7 +12110,9 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, // should assume we're changing V2's element's place and behave // accordingly. int FromV2 = std::count_if(Mask.begin(), Mask.end(), FromV2Predicate); - if (FromV1 == FromV2 && DestIndex == Mask[DestIndex] % 4) { + assert(DestIndex <= INT32_MAX && "truncated destination index"); + if (FromV1 == FromV2 && + static_cast<int>(DestIndex) == Mask[DestIndex] % 4) { From = V2; To = V1; DestIndex = @@ -9163,37 +12215,6 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget, if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT)) return SDValue(); - // Simplify the operand as it's prepared to be fed into shuffle. - unsigned SignificantBits = NVT.getSizeInBits() >> Shift; - if (V1.getOpcode() == ISD::BITCAST && - V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && - V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - V1.getOperand(0).getOperand(0) - .getSimpleValueType().getSizeInBits() == SignificantBits) { - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) - SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); - ConstantSDNode *CIdx = - dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); - // If it's foldable, i.e. normal load with single use, we will let code - // selection to fold it. Otherwise, we will short the conversion sequence. 
- if (CIdx && CIdx->getZExtValue() == 0 && - (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) { - MVT FullVT = V.getSimpleValueType(); - MVT V1VT = V1.getSimpleValueType(); - if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) { - // The "ext_vec_elt" node is wider than the result node. - // In this case we should extract subvector from V. - // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)). - unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits(); - MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(), - FullVT.getVectorNumElements()/Ratio); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, - DAG.getIntPtrConstant(0)); - } - V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V); - } - } - return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); } @@ -9343,7 +12364,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); if (HasFp256 && (VT == MVT::v4f32 || VT == MVT::v2f64)) - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, TargetMask, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, TargetMask, DAG); return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V1, @@ -9355,6 +12376,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { getShufflePALIGNRImmediate(SVOp), DAG); + if (isVALIGNMask(M, VT, Subtarget)) + return getTargetShuffleNode(X86ISD::VALIGN, dl, VT, V1, V2, + getShuffleVALIGNImmediate(SVOp), + DAG); + // Check if this can be converted into a logical shift. 
bool isLeft = false; unsigned ShAmt = 0; @@ -9520,7 +12546,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32) return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); - return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1, + return getTargetShuffleNode(X86ISD::VPERMILPI, dl, VT, V1, getShuffleSHUFImmediate(SVOp), DAG); } @@ -9639,9 +12665,10 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, return true; } -// Try to lower a vselect node into a simple blend instruction. -static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +/// \brief Try to lower a VSELECT instruction to an immediate-controlled blend +/// instruction. +static SDValue lowerVSELECTtoBLENDI(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); @@ -9683,7 +12710,14 @@ static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget, } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG); + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && + ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) + return SDValue(); + + SDValue BlendOp = lowerVSELECTtoBLENDI(Op, Subtarget, DAG); if (BlendOp.getNode()) return BlendOp; @@ -9696,6 +12730,8 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { break; case MVT::v8i16: case MVT::v16i16: + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + break; return SDValue(); } @@ -9914,62 +12950,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } -static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT.getVectorElementType(); - SDLoc dl(Op); - - SDValue N0 = Op.getOperand(0); - SDValue N1 = Op.getOperand(1); - SDValue N2 = Op.getOperand(2); - - if (!VT.is128BitVector()) - return SDValue(); - - if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && - isa<ConstantSDNode>(N2)) { - unsigned Opc; - if (VT == MVT::v8i16) - Opc = X86ISD::PINSRW; - else if (VT == MVT::v16i8) - Opc = X86ISD::PINSRB; - else - Opc = X86ISD::PINSRB; - - // Transform it so it match pinsr{b,w} which expects a GR32 as its second - // argument. - if (N1.getValueType() != MVT::i32) - N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); - if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); - return DAG.getNode(Opc, dl, VT, N0, N1, N2); - } - - if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { - // Bits [7:6] of the constant are the source select. This will always be - // zero here. The DAG Combiner may combine an extract_elt index into these - // bits. For example (insert (extract, 3), 2) could be matched by putting - // the '3' into bits [7:6] of X86ISD::INSERTPS. - // Bits [5:4] of the constant are the destination select. 
This is the - // value of the incoming immediate. - // Bits [3:0] of the constant are the zero mask. The DAG Combiner may - // combine either bitwise AND or insert of float 0.0 to set these bits. - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); - // Create this as a scalar to vector.. - N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); - } - - if ((EltVT == MVT::i32 || EltVT == MVT::i64) && isa<ConstantSDNode>(N2)) { - // PINSR* works with constant index. - return Op; - } - return SDValue(); -} - /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. -SDValue +SDValue X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); @@ -9982,7 +12965,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { // insert element and then truncate the result. MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); MVT ExtEltVT = (VecVT == MVT::v8i1 ? 
MVT::i64 : MVT::i32); - SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, + SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); @@ -10001,11 +12984,12 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(MaxSift - IdxVal, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } -SDValue -X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { + +SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); - + if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG); @@ -10013,20 +12997,20 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); + if (!isa<ConstantSDNode>(N2)) + return SDValue(); + auto *N2C = cast<ConstantSDNode>(N2); + unsigned IdxVal = N2C->getZExtValue(); - // If this is a 256-bit vector result, first extract the 128-bit vector, - // insert the element into the extracted half and then place it back. + // If the vector is wider than 128 bits, extract the 128-bit subvector, insert + // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { - if (!isa<ConstantSDNode>(N2)) - return SDValue(); - // Get the desired 128-bit vector half. - unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue(); SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired half. 
- unsigned NumEltsIn128 = 128/EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128; + unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); + unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, MVT::i32)); @@ -10034,20 +13018,60 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // Insert the changed part back to the 256-bit vector return Insert128BitVector(N0, V, IdxVal, DAG, dl); } + assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - if (Subtarget->hasSSE41()) - return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); + if (Subtarget->hasSSE41()) { + if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) { + unsigned Opc; + if (VT == MVT::v8i16) { + Opc = X86ISD::PINSRW; + } else { + assert(VT == MVT::v16i8); + Opc = X86ISD::PINSRB; + } + + // Transform it so it match pinsr{b,w} which expects a GR32 as its second + // argument. + if (N1.getValueType() != MVT::i32) + N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); + if (N2.getValueType() != MVT::i32) + N2 = DAG.getIntPtrConstant(IdxVal); + return DAG.getNode(Opc, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::f32) { + // Bits [7:6] of the constant are the source select. This will always be + // zero here. The DAG Combiner may combine an extract_elt index into + // these + // bits. For example (insert (extract, 3), 2) could be matched by + // putting + // the '3' into bits [7:6] of X86ISD::INSERTPS. + // Bits [5:4] of the constant are the destination select. This is the + // value of the incoming immediate. + // Bits [3:0] of the constant are the zero mask. The DAG Combiner may + // combine either bitwise AND or insert of float 0.0 to set these bits. + N2 = DAG.getIntPtrConstant(IdxVal << 4); + // Create this as a scalar to vector.. 
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); + } + + if (EltVT == MVT::i32 || EltVT == MVT::i64) { + // PINSR* works with constant index. + return Op; + } + } if (EltVT == MVT::i8) return SDValue(); - if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { + if (EltVT.getSizeInBits() == 16) { // Transform it so it match pinsrw which expects a 16-bit value in a GR32 // as its second argument. if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) - N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); + N2 = DAG.getIntPtrConstant(IdxVal); return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); } return SDValue(); @@ -10360,6 +13384,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI->setAdjustsStack(true); + MFI->setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); @@ -10593,7 +13618,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, getPointerTy(), Chain, IDX, MachinePointerInfo(), MVT::i32, - false, false, 0); + false, false, false, 0); else IDX = DAG.getLoad(getPointerTy(), dl, Chain, IDX, MachinePointerInfo(), false, false, false, 0); @@ -10677,9 +13702,17 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { MVT SrcVT = Op.getOperand(0).getSimpleValueType(); + SDLoc dl(Op); - if (SrcVT.isVector()) + if (SrcVT.isVector()) { + if (SrcVT.getVectorElementType() == MVT::i1) { + MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::SIGN_EXTEND, dl, 
IntegerVT, + Op.getOperand(0))); + } return SDValue(); + } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); @@ -10693,7 +13726,6 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Op; } - SDLoc dl(Op); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); @@ -10880,19 +13912,135 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // The algorithm is the following: + // #ifdef __SSE4_1__ + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + // #else + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + // #endif + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + // return (float4) lo + fhi; + + SDLoc DL(Op); + SDValue V = Op->getOperand(0); + EVT VecIntVT = V.getValueType(); + bool Is128 = VecIntVT == MVT::v4i32; + EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + // If we convert to something else than the supported type, e.g., to v4f64, + // abort early. + if (VecFloatVT != Op->getValueType(0)) + return SDValue(); + + unsigned NumElts = VecIntVT.getVectorNumElements(); + assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && + "Unsupported custom type"); + assert(NumElts <= 8 && "The size of the constant array must be fixed"); + + // In the #idef/#else code, we have in common: + // - The vector of constants: + // -- 0x4b000000 + // -- 0x53000000 + // - A shift: + // -- v >> 16 + + // Create the splat vector for 0x4b000000. 
+ SDValue CstLow = DAG.getConstant(0x4b000000, MVT::i32); + SDValue CstLowArray[] = {CstLow, CstLow, CstLow, CstLow, + CstLow, CstLow, CstLow, CstLow}; + SDValue VecCstLow = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstLowArray[0], NumElts)); + // Create the splat vector for 0x53000000. + SDValue CstHigh = DAG.getConstant(0x53000000, MVT::i32); + SDValue CstHighArray[] = {CstHigh, CstHigh, CstHigh, CstHigh, + CstHigh, CstHigh, CstHigh, CstHigh}; + SDValue VecCstHigh = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstHighArray[0], NumElts)); + + // Create the right shift. + SDValue CstShift = DAG.getConstant(16, MVT::i32); + SDValue CstShiftArray[] = {CstShift, CstShift, CstShift, CstShift, + CstShift, CstShift, CstShift, CstShift}; + SDValue VecCstShift = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, + makeArrayRef(&CstShiftArray[0], NumElts)); + SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); + + SDValue Low, High; + if (Subtarget.hasSSE41()) { + EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); + SDValue VecCstLowBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstLow); + SDValue VecBitcast = DAG.getNode(ISD::BITCAST, DL, VecI16VT, V); + // Low will be bitcasted right away, so do not bother bitcasting back to its + // original type. + Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, + VecCstLowBitcast, DAG.getConstant(0xaa, MVT::i32)); + // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), + // (uint4) 0x53000000, 0xaa); + SDValue VecCstHighBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, VecCstHigh); + SDValue VecShiftBitcast = + DAG.getNode(ISD::BITCAST, DL, VecI16VT, HighShift); + // High will be bitcasted right away, so do not bother bitcasting back to + // its original type. 
+ High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, + VecCstHighBitcast, DAG.getConstant(0xaa, MVT::i32)); + } else { + SDValue CstMask = DAG.getConstant(0xffff, MVT::i32); + SDValue VecCstMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecIntVT, CstMask, + CstMask, CstMask, CstMask); + // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; + SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); + Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); + + // uint4 hi = (v >> 16) | (uint4) 0x53000000; + High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); + } + + // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). + SDValue CstFAdd = DAG.getConstantFP( + APFloat(APFloat::IEEEsingle, APInt(32, 0xD3000080)), MVT::f32); + SDValue CstFAddArray[] = {CstFAdd, CstFAdd, CstFAdd, CstFAdd, + CstFAdd, CstFAdd, CstFAdd, CstFAdd}; + SDValue VecCstFAdd = DAG.getNode(ISD::BUILD_VECTOR, DL, VecFloatVT, + makeArrayRef(&CstFAddArray[0], NumElts)); + + // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); + SDValue HighBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, High); + SDValue FHigh = + DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); + // return (float4) lo + fhi; + SDValue LowBitcast = DAG.getNode(ISD::BITCAST, DL, VecFloatVT, Low); + return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); +} + SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); MVT SVT = N0.getSimpleValueType(); SDLoc dl(Op); - assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || - SVT == MVT::v8i8 || SVT == MVT::v8i16) && - "Custom UINT_TO_FP is not supported!"); - - MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + switch (SVT.SimpleTy) { + default: + llvm_unreachable("Custom UINT_TO_FP is not supported!"); + case MVT::v4i8: + case 
MVT::v4i16: + case MVT::v8i8: + case MVT::v8i16: { + MVT NVT = MVT::getVectorVT(MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); + } + case MVT::v4i32: + case MVT::v8i32: + return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); + } + llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -10978,7 +14126,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, 4); + MVT::f32, false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); @@ -11192,12 +14340,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) { assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && "Invalid scalar TRUNCATE operation"); - if (InVT == MVT::i32) + if (InVT.getSizeInBits() >= 32) return SDValue(); - if (InVT.getSizeInBits() == 64) - In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::i32, In); - else if (InVT.getSizeInBits() < 32) - In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); + In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); return DAG.getNode(ISD::TRUNCATE, DL, VT, In); } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && @@ -11215,7 +14360,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; } - + SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType()); const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue(); SDValue CP = DAG.getConstantPool(C, getPointerTy()); @@ 
-11375,58 +14520,47 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { In, DAG.getUNDEF(SVT))); } -static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) { - LLVMContext *Context = DAG.getContext(); +/// The only differences between FABS and FNEG are the mask and the logic op. +/// FNEG also has a folding opportunity for FNEG(FABS(x)). +static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { + assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && + "Wrong opcode for lowering FABS or FNEG."); + + bool IsFABS = (Op.getOpcode() == ISD::FABS); + + // If this is a FABS and it has an FNEG user, bail out to fold the combination + // into an FNABS. We'll lower the FABS after that if it is still in use. + if (IsFABS) + for (SDNode *User : Op->uses()) + if (User->getOpcode() == ISD::FNEG) + return Op; + + SDValue Op0 = Op.getOperand(0); + bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + // Assume scalar op for initialization; update for vector if needed. + // Note that there are no scalar bitwise logical SSE/AVX instructions, so we + // generate a 16-byte vector constant and logic op even for the scalar case. + // Using a 16-byte mask allows folding the load of the mask with + // the logic op, so it can save (~4 bytes) on code size. MVT EltVT = VT; unsigned NumElts = VT == MVT::f64 ? 2 : 4; + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to + // decide if we should generate a 16-byte constant mask when we only need 4 or + // 8 bytes for the scalar case. 
if (VT.isVector()) { EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); } - Constant *C; - if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, - APInt(64, ~(1ULL << 63)))); - else - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, - APInt(32, ~(1U << 31)))); - C = ConstantVector::getSplat(NumElts, C); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); - unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); - if (VT.isVector()) { - MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; - return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::AND, dl, ANDVT, - DAG.getNode(ISD::BITCAST, dl, ANDVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); - } - return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); -} -static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { + unsigned EltBits = EltVT.getSizeInBits(); LLVMContext *Context = DAG.getContext(); - SDLoc dl(Op); - MVT VT = Op.getSimpleValueType(); - MVT EltVT = VT; - unsigned NumElts = VT == MVT::f64 ? 2 : 4; - if (VT.isVector()) { - EltVT = VT.getVectorElementType(); - NumElts = VT.getVectorNumElements(); - } - Constant *C; - if (EltVT == MVT::f64) - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble, - APInt(64, 1ULL << 63))); - else - C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle, - APInt(32, 1U << 31))); + // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... + APInt MaskElt = + IsFABS ? 
APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits); + Constant *C = ConstantInt::get(*Context, MaskElt); C = ConstantVector::getSplat(NumElts, C); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy()); @@ -11434,16 +14568,24 @@ static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) { SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); + if (VT.isVector()) { - MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64); + // For a vector, cast operands to a vector type, perform the logic op, + // and cast the result back to the original value type. + MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); + SDValue MaskCasted = DAG.getNode(ISD::BITCAST, dl, VecVT, Mask); + SDValue Operand = IsFNABS ? + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0.getOperand(0)) : + DAG.getNode(ISD::BITCAST, dl, VecVT, Op0); + unsigned BitOp = IsFABS ? ISD::AND : IsFNABS ? ISD::OR : ISD::XOR; return DAG.getNode(ISD::BITCAST, dl, VT, - DAG.getNode(ISD::XOR, dl, XORVT, - DAG.getNode(ISD::BITCAST, dl, XORVT, - Op.getOperand(0)), - DAG.getNode(ISD::BITCAST, dl, XORVT, Mask))); + DAG.getNode(BitOp, dl, VecVT, Operand, MaskCasted)); } - return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); + // If not vector, then scalar. + unsigned BitOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; + SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; + return DAG.getNode(BitOp, dl, VT, Operand, Mask); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { @@ -11469,19 +14611,17 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. - // First get the sign bit of second operand. 
- SmallVector<Constant*,4> CV; - if (SrcVT == MVT::f64) { - const fltSemantics &Sem = APFloat::IEEEdouble; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); - } else { - const fltSemantics &Sem = APFloat::IEEEsingle; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - } + const fltSemantics &Sem = + VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + const unsigned SizeInBits = VT.getSizeInBits(); + + SmallVector<Constant *, 4> CV( + VT == MVT::f64 ? 2 : 4, + ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); + + // First, clear all bits but the sign bit from the second operand (sign). + CV[0] = ConstantFP::get(*Context, + APFloat(Sem, APInt::getHighBitsSet(SizeInBits, 1))); Constant *C = ConstantVector::get(CV); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, @@ -11489,40 +14629,30 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { false, false, false, 16); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); - // Shift sign bit right or left if the two operands have different types. - if (SrcVT.bitsGT(VT)) { - // Op0 is MVT::f32, Op1 is MVT::f64. - SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); - SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, - DAG.getConstant(32, MVT::i32)); - SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit); - SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, - DAG.getIntPtrConstant(0)); - } - - // Clear first operand sign bit. 
- CV.clear(); - if (VT == MVT::f64) { - const fltSemantics &Sem = APFloat::IEEEdouble; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, - APInt(64, ~(1ULL << 63))))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0)))); + // Next, clear the sign bit from the first operand (magnitude). + // If it's a constant, we can clear it here. + if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Op0)) { + APFloat APF = Op0CN->getValueAPF(); + // If the magnitude is a positive zero, the sign bit alone is enough. + if (APF.isPosZero()) + return SignBit; + APF.clearSign(); + CV[0] = ConstantFP::get(*Context, APF); } else { - const fltSemantics &Sem = APFloat::IEEEsingle; - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, - APInt(32, ~(1U << 31))))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); - CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0)))); + CV[0] = ConstantFP::get( + *Context, + APFloat(Sem, APInt::getLowBitsSet(SizeInBits, SizeInBits - 1))); } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(), 16); - SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); - - // Or the value with the sign bit. + SDValue Val = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(), + false, false, false, 16); + // If the magnitude operand wasn't a constant, we need to AND out the sign. + if (!isa<ConstantFPSDNode>(Op0)) + Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Val); + + // OR the magnitude value with the sign bit. 
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } @@ -11537,8 +14667,7 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } -// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. -// +// Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); @@ -11897,12 +15026,12 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, if (Op0.getValueType() == MVT::i1) llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); } - + if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { - // Do the comparison at i32 if it's smaller, besides the Atom case. - // This avoids subregister aliasing issues. Keep the smaller reference - // if we're optimizing for size, however, as that'll allow better folding + // Do the comparison at i32 if it's smaller, besides the Atom case. + // This avoids subregister aliasing issues. Keep the smaller reference + // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && !DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( @@ -11946,6 +15075,66 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). 
+SDValue X86TargetLowering::getRsqrtEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor and/or sqrt operand. + if (!Subtarget->useSqrtEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rsqrtss and rsqrtps. + // TODO: Add support for AVX512 (v16f32). + // It is likely not profitable to do this for f64 because a double-precision + // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 + // instructions: convert to single, rsqrtss, convert back to double, refine + // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = 1; + UseOneConstNR = false; + return DCI.DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); + } + return SDValue(); +} + +/// The minimum architected relative accuracy is 2^-12. We need one +/// Newton-Raphson step to have a good float result (24 bits of precision). +SDValue X86TargetLowering::getRecipEstimate(SDValue Op, + DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const { + // FIXME: We should use instruction latency models to calculate the cost of + // each potential sequence, but this is very hard to do reliably because + // at least Intel's Core* chips have variable timing based on the number of + // significant digits in the divisor. + if (!Subtarget->useReciprocalEst()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. + // TODO: Add support for AVX512 (v16f32). 
+ // It is likely not profitable to do this for f64 because a double-precision + // reciprocal estimate with refinement on x86 prior to FMA requires + // 15 instructions: convert to single, rcpss, convert back to double, refine + // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA + // along with FMA, this could be a throughput win. + if ((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasAVX() && VT == MVT::v8f32)) { + RefinementSteps = ReciprocalEstimateRefinementSteps; + return DCI.DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); + } + return SDValue(); +} + static bool isAllOnes(SDValue V) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); return C && C->isAllOnesValue(); @@ -12105,7 +15294,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 && + assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && Op.getValueType().getScalarType() == MVT::i1 && "Cannot set masked compare for this operation"); @@ -12219,11 +15408,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, EVT OpVT = Op1.getValueType(); if (Subtarget->hasAVX512()) { if (Op1.getValueType().is512BitVector() || + (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); // In AVX-512 architecture setcc returns mask with i1 elements, - // But there is no compare instruction for i8 and i16 elements. + // But there is no compare instruction for i8 and i16 elements in KNL. // We are not talking about 512-bit operands in this case, these // types are illegal. 
if (MaskResult && @@ -12426,8 +15616,11 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { cast<ConstantSDNode>(Op1)->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) + if (NewSetCC.getNode()) { + if (VT == MVT::i1) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; + } } // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of @@ -12729,18 +15922,40 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); + MVT VTElt = VT.getVectorElementType(); + MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); + // SKX processor + if ((InVTElt == MVT::i1) && + (((Subtarget->hasBWI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasBWI() && VT.is512BitVector() && + VTElt.getSizeInBits() <= 16)) || + + ((Subtarget->hasDQI() && Subtarget->hasVLX() && + VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) || + + ((Subtarget->hasDQI() && VT.is512BitVector() && + VTElt.getSizeInBits() >= 32)))) + return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + unsigned int NumElts = VT.getVectorNumElements(); + if (NumElts != 8 && NumElts != 16) return SDValue(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) + if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) { + if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) + return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0)); return DAG.getNode(X86ISD::VSEXT, dl, VT, In); + } const TargetLowering &TLI = 
DAG.getTargetLoweringInfo(); assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); @@ -12768,7 +15983,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDLoc dl(Op); if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) - return LowerSIGN_EXTEND_AVX512(Op, DAG); + return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && @@ -12811,6 +16026,208 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } +// Lower vector extended loads using a shuffle. If SSSE3 is not available we +// may emit an illegal shuffle but the expansion is still better than scalar +// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise +// we'll emit a shuffle and a arithmetic shift. +// TODO: It is possible to support ZExt by zeroing the undef values during +// the shuffle phase or after the shuffle. +static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT RegVT = Op.getSimpleValueType(); + assert(RegVT.isVector() && "We only custom lower vector sext loads."); + assert(RegVT.isInteger() && + "We only custom lower integer vector sext loads."); + + // Nothing useful we can do without SSE2 shuffles. 
+ assert(Subtarget->hasSSE2() && "We only custom lower sext loads with SSE2."); + + LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); + SDLoc dl(Ld); + EVT MemVT = Ld->getMemoryVT(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned RegSz = RegVT.getSizeInBits(); + + ISD::LoadExtType Ext = Ld->getExtensionType(); + + assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) + && "Only anyext and sext are currently implemented."); + assert(MemVT != RegVT && "Cannot extend to the same type"); + assert(MemVT.isVector() && "Must load a vector from memory"); + + unsigned NumElems = RegVT.getVectorNumElements(); + unsigned MemSz = MemVT.getSizeInBits(); + assert(RegSz > MemSz && "Register size must be greater than the mem size"); + + if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) { + // The only way in which we have a legal 256-bit vector result but not the + // integer 256-bit operations needed to directly lower a sextload is if we + // have AVX1 but not AVX2. In that case, we can always emit a sextload to + // a 128-bit vector and a normal sign_extend to 256-bits that should get + // correctly legalized. We do this late to allow the canonical form of + // sextload to persist throughout the rest of the DAG combiner -- it wants + // to fold together any extensions it can, and so will fuse a sign_extend + // of an sextload into a sextload targeting a wider value. + SDValue Load; + if (MemSz == 128) { + // Just switch this to a normal load. + assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " + "it must be a legal 128-bit vector " + "type!"); + Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment()); + } else { + assert(MemSz < 128 && + "Can't extend a type wider than 128 bits to a 256 bit vector!"); + // Do an sext load to a 128-bit vector type. 
We want to use the same + // number of elements, but elements half as wide. This will end up being + // recursively lowered by this routine, but will succeed as we definitely + // have all the necessary features if we're using AVX1. + EVT HalfEltVT = + EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); + EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); + Load = + DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), MemVT, Ld->isVolatile(), + Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + } + + // Replace chain users with the new chain. + assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); + + // Finally, do a normal sign-extend to the desired register. + return DAG.getSExtOrTrunc(Load, dl, RegVT); + } + + // All sizes must be a power of two. + assert(isPowerOf2_32(RegSz * MemSz * NumElems) && + "Non-power-of-two elements are not custom lowered!"); + + // Attempt to load the original value using scalar loads. + // Find the largest scalar type that divides the total loaded size. + MVT SclrLoadTy = MVT::i8; + for (MVT Tp : MVT::integer_valuetypes()) { + if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { + SclrLoadTy = Tp; + } + } + + // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. + if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && + (64 <= MemSz)) + SclrLoadTy = MVT::f64; + + // Calculate the number of scalar loads that we need to perform + // in order to load our vector from memory. 
+ unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); + + assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && + "Can only lower sext loads with a single scalar load!"); + + unsigned loadRegZize = RegSz; + if (Ext == ISD::SEXTLOAD && RegSz == 256) + loadRegZize /= 2; + + // Represent our vector as a sequence of elements which are the + // largest scalar that we can load. + EVT LoadUnitVecVT = EVT::getVectorVT( + *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); + + // Represent the data using the same element type that is stored in + // memory. In practice, we ''widen'' MemVT. + EVT WideVecVT = + EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), + loadRegZize / MemVT.getScalarType().getSizeInBits()); + + assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && + "Invalid vector type"); + + // We can't shuffle using an illegal type. + assert(TLI.isTypeLegal(WideVecVT) && + "We only lower types that form legal widened vector types"); + + SmallVector<SDValue, 8> Chains; + SDValue Ptr = Ld->getBasePtr(); + SDValue Increment = + DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, TLI.getPointerTy()); + SDValue Res = DAG.getUNDEF(LoadUnitVecVT); + + for (unsigned i = 0; i < NumLoads; ++i) { + // Perform a single load. + SDValue ScalarLoad = + DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), + Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), + Ld->getAlignment()); + Chains.push_back(ScalarLoad.getValue(1)); + // Create the first element type using SCALAR_TO_VECTOR in order to avoid + // another round of DAGCombining. 
+ if (i == 0) + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); + else + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, + ScalarLoad, DAG.getIntPtrConstant(i)); + + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); + } + + SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + // Bitcast the loaded value to a vector of the original element type, in + // the size of the target vector type. + SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); + unsigned SizeRatio = RegSz / MemSz; + + if (Ext == ISD::SEXTLOAD) { + // If we have SSE4.1, we can directly emit a VSEXT node. + if (Subtarget->hasSSE41()) { + SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Sext; + } + + // Otherwise we'll shuffle the small elements in the high bits of the + // larger type and perform an arithmetic shift. If the shift is not legal + // it's better to scalarize. + assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && + "We can't implement a sext load without an arithmetic right shift!"); + + // Redistribute the loaded elements into the different locations. + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; + + SDValue Shuff = DAG.getVectorShuffle( + WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + + // Build the arithmetic shift. + unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - + MemVT.getVectorElementType().getSizeInBits(); + Shuff = + DAG.getNode(ISD::SRA, dl, RegVT, Shuff, DAG.getConstant(Amt, RegVT)); + + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; + } + + // Redistribute the loaded elements into the different locations. 
+ SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + + // Bitcast to the requested type. + Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Shuff; +} + // isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart // from the AND / OR. @@ -13116,7 +16533,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. -// Calls to _alloca is needed to probe the stack when allocating more than 4k +// Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. 
@@ -13125,7 +16542,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) || + bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMachO()) || SplitStack; SDLoc dl(Op); @@ -13151,7 +16568,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); - const TargetFrameLowering &TFI = *DAG.getTarget().getFrameLowering(); + const TargetFrameLowering &TFI = *DAG.getSubtarget().getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) @@ -13174,7 +16591,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, EVT VT = Op.getNode()->getValueType(0); bool Is64Bit = Subtarget->is64Bit(); - EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; + EVT SPTy = getPointerTy(); if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -13192,7 +16609,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = - getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, @@ -13201,7 +16618,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; - unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + const unsigned Reg = (Subtarget->isTarget64BitLP64() ? 
X86::RAX : X86::EAX); Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); @@ -13209,8 +16626,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); @@ -13451,7 +16868,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT, static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, SDValue SrcOp, SDValue ShAmt, SelectionDAG &DAG) { - assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32"); + MVT SVT = ShAmt.getSimpleValueType(); + assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) @@ -13466,13 +16884,28 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } - // Need to build a vector containing shift amount - // Shift amount is 32-bits, but SSE instructions read 64-bit, so fill with 0 - SDValue ShOps[4]; - ShOps[0] = ShAmt; - ShOps[1] = DAG.getConstant(0, MVT::i32); - ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32); - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps); + const X86Subtarget &Subtarget = + DAG.getTarget().getSubtarget<X86Subtarget>(); + if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && + ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { + // Let the shuffle legalizer expand this shift amount node. 
+ SDValue Op0 = ShAmt.getOperand(0); + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0); + ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG); + } else { + // Need to build a vector containing shift amount. + // SSE/AVX packed shifts only use the lower 64-bit of the shift count. + SmallVector<SDValue, 4> ShOps; + ShOps.push_back(ShAmt); + if (SVT == MVT::i32) { + ShOps.push_back(DAG.getConstant(0, SVT)); + ShOps.push_back(DAG.getUNDEF(SVT)); + } + ShOps.push_back(DAG.getUNDEF(SVT)); + + MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64; + ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps); + } // The return type has to be a 128-bit type with the same element // type as the input type. @@ -13483,382 +16916,271 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { - SDLoc dl(Op); - unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); // Don't custom lower most intrinsics. - // Comparison intrinsics. 
- case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_comieq_sd: - case Intrinsic::x86_sse2_comilt_sd: - case Intrinsic::x86_sse2_comile_sd: - case Intrinsic::x86_sse2_comigt_sd: - case Intrinsic::x86_sse2_comige_sd: - case Intrinsic::x86_sse2_comineq_sd: - case Intrinsic::x86_sse2_ucomieq_sd: - case Intrinsic::x86_sse2_ucomilt_sd: - case Intrinsic::x86_sse2_ucomile_sd: - case Intrinsic::x86_sse2_ucomigt_sd: - case Intrinsic::x86_sse2_ucomige_sd: - case Intrinsic::x86_sse2_ucomineq_sd: { - unsigned Opc; - ISD::CondCode CC; +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting for \p Mask when lowering masking intrinsics. +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), + MVT::i1, VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + + assert(MaskVT.isSimple() && "invalid mask type"); + + if (isAllOnes(Mask)) + return Op; + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
+ SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +} + +/// \brief Creates an SDNode for a predicated scalar operation. +/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). +/// The mask is comming as MVT::i8 and it should be truncated +/// to MVT::i1 while lowering masking intrinsics. +/// The main difference between ScalarMaskingNode and VectorMaskingNode is using +/// "X86select" instead of "vselect". We just can't create the "vselect" node for +/// a scalar instruction. +static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + if (isAllOnes(Mask)) + return Op; + + EVT VT = Op.getValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); +} + +static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) { switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse_comieq_ss: - case Intrinsic::x86_sse2_comieq_sd: - Opc = X86ISD::COMI; - CC = ISD::SETEQ; - break; - case Intrinsic::x86_sse_comilt_ss: - case Intrinsic::x86_sse2_comilt_sd: - Opc = X86ISD::COMI; - CC = ISD::SETLT; - break; - case Intrinsic::x86_sse_comile_ss: - case Intrinsic::x86_sse2_comile_sd: - Opc = X86ISD::COMI; - CC = ISD::SETLE; - break; - case Intrinsic::x86_sse_comigt_ss: - case Intrinsic::x86_sse2_comigt_sd: - Opc = X86ISD::COMI; - CC = ISD::SETGT; - break; - case Intrinsic::x86_sse_comige_ss: - case Intrinsic::x86_sse2_comige_sd: - Opc = X86ISD::COMI; - CC = ISD::SETGE; - break; - case Intrinsic::x86_sse_comineq_ss: - case Intrinsic::x86_sse2_comineq_sd: - Opc = X86ISD::COMI; - CC = ISD::SETNE; - break; - case Intrinsic::x86_sse_ucomieq_ss: - case Intrinsic::x86_sse2_ucomieq_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETEQ; - break; - case Intrinsic::x86_sse_ucomilt_ss: - case Intrinsic::x86_sse2_ucomilt_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETLT; - break; - case Intrinsic::x86_sse_ucomile_ss: - case Intrinsic::x86_sse2_ucomile_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETLE; - break; - case Intrinsic::x86_sse_ucomigt_ss: - case Intrinsic::x86_sse2_ucomigt_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETGT; - break; - case Intrinsic::x86_sse_ucomige_ss: - case Intrinsic::x86_sse2_ucomige_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETGE; - break; - case Intrinsic::x86_sse_ucomineq_ss: - case Intrinsic::x86_sse2_ucomineq_sd: - Opc = X86ISD::UCOMI; - CC = ISD::SETNE; - break; + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_mask_vfmadd_ps_512: + case Intrinsic::x86_fma_mask_vfmadd_pd_512: + return X86ISD::FMADD; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_mask_vfmsub_ps_512: + 
case Intrinsic::x86_fma_mask_vfmsub_pd_512: + return X86ISD::FMSUB; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_mask_vfnmadd_ps_512: + case Intrinsic::x86_fma_mask_vfnmadd_pd_512: + return X86ISD::FNMADD; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_mask_vfnmsub_ps_512: + case Intrinsic::x86_fma_mask_vfnmsub_pd_512: + return X86ISD::FNMSUB; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: + case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: + return X86ISD::FMADDSUB; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: + case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: + return X86ISD::FMSUBADD; } +} - SDValue LHS = Op.getOperand(1); - SDValue RHS = Op.getOperand(2); - unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); - assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); - SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); - SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, - DAG.getConstant(X86CC, MVT::i8), Cond); - return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); - } +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc dl(Op); + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); + if (IntrData) { + switch(IntrData->Type) { + case 
INTR_TYPE_1OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + case INTR_TYPE_2OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2)); + case INTR_TYPE_3OP: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case INTR_TYPE_1OP_MASK_RM: { + SDValue Src = Op.getOperand(1); + SDValue Src0 = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue RoundingMode = Op.getOperand(4); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, + RoundingMode), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src0 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + SDValue RoundingMode = Op.getOperand(5); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, + RoundingMode), + Mask, Src0, Subtarget, DAG); + } + case INTR_TYPE_2OP_MASK: { + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), + Op.getOperand(2)), + Op.getOperand(4), Op.getOperand(3), Subtarget, DAG); + } + case CMP_MASK: + case CMP_MASK_CC: { + // Comparison intrinsics with masks. + // Example of transformation: + // (i8 (int_x86_avx512_mask_pcmpeq_q_128 + // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> + // (i8 (bitcast + // (v8i1 (insert_subvector undef, + // (v2i1 (and (PCMPEQM %a, %b), + // (extract_subvector + // (v8i1 (bitcast %mask)), 0))), 0)))) + EVT VT = Op.getOperand(1).getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 
4 : 3); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue Cmp; + if (IntrData->Type == CMP_MASK_CC) { + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } else { + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), + Op.getOperand(2)); + } + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CmpMask, + DAG.getIntPtrConstant(0)); + return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res); + } + case COMI: { // Comparison intrinsics + ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); + assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); + SDValue Cond = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86CC, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } + case VSHIFT: + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), + Op.getOperand(1), Op.getOperand(2), DAG); + case VSHIFT_MASK: + return getVectorMaskingNode(getTargetVShiftNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), + Op.getOperand(1), + Op.getOperand(2), DAG), + Op.getOperand(4), Op.getOperand(3), Subtarget, + DAG); + case COMPRESS_EXPAND_IN_REG: { + SDValue Mask = Op.getOperand(3); + SDValue DataToCompress = Op.getOperand(1); + SDValue PassThru = Op.getOperand(2); + if (isAllOnes(Mask)) // return data as is + return Op.getOperand(1); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = 
EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); - // Arithmetic intrinsics. - case Intrinsic::x86_sse2_pmulu_dq: - case Intrinsic::x86_avx2_pmulu_dq: - return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse41_pmuldq: - case Intrinsic::x86_avx2_pmul_dq: - return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pmulhu_w: - case Intrinsic::x86_avx2_pmulhu_w: - return DAG.getNode(ISD::MULHU, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pmulh_w: - case Intrinsic::x86_avx2_pmulh_w: - return DAG.getNode(ISD::MULHS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - // SSE2/AVX2 sub with unsigned saturation intrinsics - case Intrinsic::x86_sse2_psubus_b: - case Intrinsic::x86_sse2_psubus_w: - case Intrinsic::x86_avx2_psubus_b: - case Intrinsic::x86_avx2_psubus_w: - return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - // SSE3/AVX horizontal add/sub intrinsics - case Intrinsic::x86_sse3_hadd_ps: - case Intrinsic::x86_sse3_hadd_pd: - case Intrinsic::x86_avx_hadd_ps_256: - case Intrinsic::x86_avx_hadd_pd_256: - case Intrinsic::x86_sse3_hsub_ps: - case Intrinsic::x86_sse3_hsub_pd: - case Intrinsic::x86_avx_hsub_ps_256: - case Intrinsic::x86_avx_hsub_pd_256: - case Intrinsic::x86_ssse3_phadd_w_128: - case Intrinsic::x86_ssse3_phadd_d_128: - case Intrinsic::x86_avx2_phadd_w: - case Intrinsic::x86_avx2_phadd_d: - case Intrinsic::x86_ssse3_phsub_w_128: - case Intrinsic::x86_ssse3_phsub_d_128: - case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible 
intrinsic"); // Can't reach here. - case Intrinsic::x86_sse3_hadd_ps: - case Intrinsic::x86_sse3_hadd_pd: - case Intrinsic::x86_avx_hadd_ps_256: - case Intrinsic::x86_avx_hadd_pd_256: - Opcode = X86ISD::FHADD; - break; - case Intrinsic::x86_sse3_hsub_ps: - case Intrinsic::x86_sse3_hsub_pd: - case Intrinsic::x86_avx_hsub_ps_256: - case Intrinsic::x86_avx_hsub_pd_256: - Opcode = X86ISD::FHSUB; - break; - case Intrinsic::x86_ssse3_phadd_w_128: - case Intrinsic::x86_ssse3_phadd_d_128: - case Intrinsic::x86_avx2_phadd_w: - case Intrinsic::x86_avx2_phadd_d: - Opcode = X86ISD::HADD; - break; - case Intrinsic::x86_ssse3_phsub_w_128: - case Intrinsic::x86_ssse3_phsub_d_128: - case Intrinsic::x86_avx2_phsub_w: - case Intrinsic::x86_avx2_phsub_d: - Opcode = X86ISD::HSUB; - break; + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, + PassThru); } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE2/SSE41/AVX2 integer max/min intrinsics. - case Intrinsic::x86_sse2_pmaxu_b: - case Intrinsic::x86_sse41_pmaxuw: - case Intrinsic::x86_sse41_pmaxud: - case Intrinsic::x86_avx2_pmaxu_b: - case Intrinsic::x86_avx2_pmaxu_w: - case Intrinsic::x86_avx2_pmaxu_d: - case Intrinsic::x86_sse2_pminu_b: - case Intrinsic::x86_sse41_pminuw: - case Intrinsic::x86_sse41_pminud: - case Intrinsic::x86_avx2_pminu_b: - case Intrinsic::x86_avx2_pminu_w: - case Intrinsic::x86_avx2_pminu_d: - case Intrinsic::x86_sse41_pmaxsb: - case Intrinsic::x86_sse2_pmaxs_w: - case Intrinsic::x86_sse41_pmaxsd: - case Intrinsic::x86_avx2_pmaxs_b: - case Intrinsic::x86_avx2_pmaxs_w: - case Intrinsic::x86_avx2_pmaxs_d: - case Intrinsic::x86_sse41_pminsb: - case Intrinsic::x86_sse2_pmins_w: - case Intrinsic::x86_sse41_pminsd: - case Intrinsic::x86_avx2_pmins_b: - case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_sse2_pmaxu_b: - case Intrinsic::x86_sse41_pmaxuw: - case Intrinsic::x86_sse41_pmaxud: - case Intrinsic::x86_avx2_pmaxu_b: - case Intrinsic::x86_avx2_pmaxu_w: - case Intrinsic::x86_avx2_pmaxu_d: - Opcode = X86ISD::UMAX; - break; - case Intrinsic::x86_sse2_pminu_b: - case Intrinsic::x86_sse41_pminuw: - case Intrinsic::x86_sse41_pminud: - case Intrinsic::x86_avx2_pminu_b: - case Intrinsic::x86_avx2_pminu_w: - case Intrinsic::x86_avx2_pminu_d: - Opcode = X86ISD::UMIN; - break; - case Intrinsic::x86_sse41_pmaxsb: - case Intrinsic::x86_sse2_pmaxs_w: - case Intrinsic::x86_sse41_pmaxsd: - case Intrinsic::x86_avx2_pmaxs_b: - case Intrinsic::x86_avx2_pmaxs_w: - case Intrinsic::x86_avx2_pmaxs_d: - Opcode = X86ISD::SMAX; - break; - case Intrinsic::x86_sse41_pminsb: - case Intrinsic::x86_sse2_pmins_w: - case Intrinsic::x86_sse41_pminsd: - case Intrinsic::x86_avx2_pmins_b: - case Intrinsic::x86_avx2_pmins_w: - case Intrinsic::x86_avx2_pmins_d: - Opcode = X86ISD::SMIN; - break; + case BLEND: { + SDValue Mask = Op.getOperand(3); + EVT VT = Op.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDLoc dl(Op); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), + Op.getOperand(2)); } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE/SSE2/AVX floating point max/min intrinsics. 
- case Intrinsic::x86_sse_max_ps: - case Intrinsic::x86_sse2_max_pd: - case Intrinsic::x86_avx_max_ps_256: - case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_sse_min_ps: - case Intrinsic::x86_sse2_min_pd: - case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse_max_ps: - case Intrinsic::x86_sse2_max_pd: - case Intrinsic::x86_avx_max_ps_256: - case Intrinsic::x86_avx_max_pd_256: - Opcode = X86ISD::FMAX; - break; - case Intrinsic::x86_sse_min_ps: - case Intrinsic::x86_sse2_min_pd: - case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: - Opcode = X86ISD::FMIN; - break; + case FMA_OP_MASK: + { + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // AVX2 variable shift intrinsics - case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q_256: - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q_256: - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_avx2_psllv_d: - case Intrinsic::x86_avx2_psllv_q: - case Intrinsic::x86_avx2_psllv_d_256: - case Intrinsic::x86_avx2_psllv_q_256: - Opcode = ISD::SHL; - break; - case Intrinsic::x86_avx2_psrlv_d: - case Intrinsic::x86_avx2_psrlv_q: - case Intrinsic::x86_avx2_psrlv_d_256: - case Intrinsic::x86_avx2_psrlv_q_256: - Opcode = ISD::SRL; - break; - case Intrinsic::x86_avx2_psrav_d: - case Intrinsic::x86_avx2_psrav_d_256: - Opcode = ISD::SRA; + default: break; } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - case Intrinsic::x86_sse2_packssdw_128: - case Intrinsic::x86_sse2_packsswb_128: - case Intrinsic::x86_avx2_packssdw: - case Intrinsic::x86_avx2_packsswb: - return DAG.getNode(X86ISD::PACKSS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_packuswb_128: - case Intrinsic::x86_sse41_packusdw: - case Intrinsic::x86_avx2_packuswb: - case Intrinsic::x86_avx2_packusdw: - return DAG.getNode(X86ISD::PACKUS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_ssse3_pshuf_b_128: - case Intrinsic::x86_avx2_pshuf_b: - return DAG.getNode(X86ISD::PSHUFB, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshuf_d: - return DAG.getNode(X86ISD::PSHUFD, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshufl_w: - return DAG.getNode(X86ISD::PSHUFLW, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_sse2_pshufh_w: - return DAG.getNode(X86ISD::PSHUFHW, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::x86_ssse3_psign_b_128: - case Intrinsic::x86_ssse3_psign_w_128: - case Intrinsic::x86_ssse3_psign_d_128: - case Intrinsic::x86_avx2_psign_b: - case Intrinsic::x86_avx2_psign_w: - case Intrinsic::x86_avx2_psign_d: - return DAG.getNode(X86ISD::PSIGN, dl, Op.getValueType(), - Op.getOperand(1), 
Op.getOperand(2)); - - case Intrinsic::x86_sse41_insertps: - return DAG.getNode(X86ISD::INSERTPS, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::x86_avx_vperm2f128_ps_256: - case Intrinsic::x86_avx_vperm2f128_pd_256: - case Intrinsic::x86_avx_vperm2f128_si_256: - case Intrinsic::x86_avx2_vperm2i128: - return DAG.getNode(X86ISD::VPERM2X128, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + } - case Intrinsic::x86_avx2_permd: - case Intrinsic::x86_avx2_permps: - // Operands intentionally swapped. Mask is last operand to intrinsic, - // but second operand for node/instruction. - return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), - Op.getOperand(2), Op.getOperand(1)); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. - case Intrinsic::x86_sse_sqrt_ps: - case Intrinsic::x86_sse2_sqrt_pd: - case Intrinsic::x86_avx_sqrt_ps_256: - case Intrinsic::x86_avx_sqrt_pd_256: - return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::x86_avx512_mask_valign_q_512: + case Intrinsic::x86_avx512_mask_valign_d_512: + // Vector source operands are swapped. + return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl, + Op.getValueType(), Op.getOperand(2), + Op.getOperand(1), + Op.getOperand(3)), + Op.getOperand(5), Op.getOperand(4), + Subtarget, DAG); // ptest and testp intrinsics. 
The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest @@ -13936,100 +17258,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } - // SSE/AVX shift intrinsics - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse2_psll_w: - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_avx2_psll_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - Opcode = X86ISD::VSHL; - break; - case Intrinsic::x86_sse2_psrl_w: - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: - case Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - Opcode = X86ISD::VSRL; - break; - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: - Opcode = X86ISD::VSRA; - break; - } - return DAG.getNode(Opcode, dl, Op.getValueType(), - Op.getOperand(1), Op.getOperand(2)); - } - - // SSE/AVX immediate shift intrinsics - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case 
Intrinsic::x86_avx2_pslli_q: - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: { - unsigned Opcode; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. - case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_sse2_pslli_d: - case Intrinsic::x86_sse2_pslli_q: - case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx2_pslli_d: - case Intrinsic::x86_avx2_pslli_q: - Opcode = X86ISD::VSHLI; - break; - case Intrinsic::x86_sse2_psrli_w: - case Intrinsic::x86_sse2_psrli_d: - case Intrinsic::x86_sse2_psrli_q: - case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx2_psrli_d: - case Intrinsic::x86_avx2_psrli_q: - Opcode = X86ISD::VSRLI; - break; - case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - Opcode = X86ISD::VSRAI; - break; - } - return getTargetVShiftNode(Opcode, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), DAG); - } - case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -14106,6 +17334,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } + + case Intrinsic::x86_fma_mask_vfmadd_ps_512: + case Intrinsic::x86_fma_mask_vfmadd_pd_512: + case Intrinsic::x86_fma_mask_vfmsub_ps_512: + case Intrinsic::x86_fma_mask_vfmsub_pd_512: + case Intrinsic::x86_fma_mask_vfnmadd_ps_512: + case Intrinsic::x86_fma_mask_vfnmadd_pd_512: + case Intrinsic::x86_fma_mask_vfnmsub_ps_512: + case 
Intrinsic::x86_fma_mask_vfnmsub_pd_512: + case Intrinsic::x86_fma_mask_vfmaddsub_ps_512: + case Intrinsic::x86_fma_mask_vfmaddsub_pd_512: + case Intrinsic::x86_fma_mask_vfmsubadd_ps_512: + case Intrinsic::x86_fma_mask_vfmsubadd_pd_512: { + auto *SAE = cast<ConstantSDNode>(Op.getOperand(5)); + if (SAE->getZExtValue() == X86::STATIC_ROUNDING::CUR_DIRECTION) + return getVectorMaskingNode(DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), + dl, Op.getValueType(), + Op.getOperand(1), + Op.getOperand(2), + Op.getOperand(3)), + Op.getOperand(4), Op.getOperand(1), + Subtarget, DAG); + else + return SDValue(); + } + case Intrinsic::x86_fma_vfmadd_ps: case Intrinsic::x86_fma_vfmadd_pd: case Intrinsic::x86_fma_vfmsub_ps: @@ -14130,74 +17384,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_fma_vfmaddsub_pd_256: case Intrinsic::x86_fma_vfmsubadd_ps_256: case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_vfmadd_ps_512: - case Intrinsic::x86_fma_vfmadd_pd_512: - case Intrinsic::x86_fma_vfmsub_ps_512: - case Intrinsic::x86_fma_vfmsub_pd_512: - case Intrinsic::x86_fma_vfnmadd_ps_512: - case Intrinsic::x86_fma_vfnmadd_pd_512: - case Intrinsic::x86_fma_vfnmsub_ps_512: - case Intrinsic::x86_fma_vfnmsub_pd_512: - case Intrinsic::x86_fma_vfmaddsub_ps_512: - case Intrinsic::x86_fma_vfmaddsub_pd_512: - case Intrinsic::x86_fma_vfmsubadd_ps_512: - case Intrinsic::x86_fma_vfmsubadd_pd_512: { - unsigned Opc; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
- case Intrinsic::x86_fma_vfmadd_ps: - case Intrinsic::x86_fma_vfmadd_pd: - case Intrinsic::x86_fma_vfmadd_ps_256: - case Intrinsic::x86_fma_vfmadd_pd_256: - case Intrinsic::x86_fma_vfmadd_ps_512: - case Intrinsic::x86_fma_vfmadd_pd_512: - Opc = X86ISD::FMADD; - break; - case Intrinsic::x86_fma_vfmsub_ps: - case Intrinsic::x86_fma_vfmsub_pd: - case Intrinsic::x86_fma_vfmsub_ps_256: - case Intrinsic::x86_fma_vfmsub_pd_256: - case Intrinsic::x86_fma_vfmsub_ps_512: - case Intrinsic::x86_fma_vfmsub_pd_512: - Opc = X86ISD::FMSUB; - break; - case Intrinsic::x86_fma_vfnmadd_ps: - case Intrinsic::x86_fma_vfnmadd_pd: - case Intrinsic::x86_fma_vfnmadd_ps_256: - case Intrinsic::x86_fma_vfnmadd_pd_256: - case Intrinsic::x86_fma_vfnmadd_ps_512: - case Intrinsic::x86_fma_vfnmadd_pd_512: - Opc = X86ISD::FNMADD; - break; - case Intrinsic::x86_fma_vfnmsub_ps: - case Intrinsic::x86_fma_vfnmsub_pd: - case Intrinsic::x86_fma_vfnmsub_ps_256: - case Intrinsic::x86_fma_vfnmsub_pd_256: - case Intrinsic::x86_fma_vfnmsub_ps_512: - case Intrinsic::x86_fma_vfnmsub_pd_512: - Opc = X86ISD::FNMSUB; - break; - case Intrinsic::x86_fma_vfmaddsub_ps: - case Intrinsic::x86_fma_vfmaddsub_pd: - case Intrinsic::x86_fma_vfmaddsub_ps_256: - case Intrinsic::x86_fma_vfmaddsub_pd_256: - case Intrinsic::x86_fma_vfmaddsub_ps_512: - case Intrinsic::x86_fma_vfmaddsub_pd_512: - Opc = X86ISD::FMADDSUB; - break; - case Intrinsic::x86_fma_vfmsubadd_ps: - case Intrinsic::x86_fma_vfmsubadd_pd: - case Intrinsic::x86_fma_vfmsubadd_ps_256: - case Intrinsic::x86_fma_vfmsubadd_pd_256: - case Intrinsic::x86_fma_vfmsubadd_ps_512: - case Intrinsic::x86_fma_vfmsubadd_pd_512: - Opc = X86ISD::FMSUBADD; - break; - } - - return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3)); - } + return DAG.getNode(getOpcodeForFMAIntrinsic(IntNo), dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } @@ -14382,122 +17570,25 @@ static SDValue 
LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -enum IntrinsicType { - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST -}; - -struct IntrinsicData { - IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1) - :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {} - IntrinsicType Type; - unsigned Opc0; - unsigned Opc1; -}; - -std::map < unsigned, IntrinsicData> IntrMap; -static void InitIntinsicsMap() { - static bool Initialized = false; - if (Initialized) - return; - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, - IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512, - IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512, - IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512, - IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512, - IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, - IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, - IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, - IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, - IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0))); - - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512, - IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, - IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, - IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0))); - 
IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, - IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, - IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, - IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, - IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, - IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0))); - - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm, - X86::VGATHERPF1QPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm, - X86::VGATHERPF1QPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm, - X86::VGATHERPF1DPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, - IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm, - X86::VGATHERPF1DPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm, - X86::VSCATTERPF1QPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm, - X86::VSCATTERPF1QPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm, - X86::VSCATTERPF1DPDm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, - IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm, - X86::VSCATTERPF1DPSm))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - 
IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64, - IntrinsicData(RDRAND, X86ISD::RDRAND, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64, - IntrinsicData(RDSEED, X86ISD::RDSEED, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_xtest, - IntrinsicData(XTEST, X86ISD::XTEST, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc, - IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp, - IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0))); - IntrMap.insert(std::make_pair(Intrinsic::x86_rdpmc, - IntrinsicData(RDPMC, X86ISD::RDPMC_DAG, 0))); - Initialized = true; -} static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - InitIntinsicsMap(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo); - if (itr == IntrMap.end()) + + const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); + if (!IntrData) return SDValue(); SDLoc dl(Op); - IntrinsicData Intr = itr->second; - switch(Intr.Type) { + switch(IntrData->Type) { + default: + llvm_unreachable("Unknown Intrinsic Type"); + break; case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); - SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0)); + SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. 
@@ -14521,7 +17612,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, + return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { @@ -14532,7 +17623,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); - return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); + return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain); } case PREFETCH: { SDValue Hint = Op.getOperand(6); @@ -14540,7 +17631,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, if (dyn_cast<ConstantSDNode> (Hint) == nullptr || (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1) llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1"); - unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0); + unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); @@ -14551,7 +17642,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector<SDValue, 2> Results; - getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results); + getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. @@ -14563,7 +17654,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, // XTEST intrinsics. 
case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); - SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0)); + SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(X86::COND_NE, MVT::i8), InTrans); @@ -14571,8 +17662,79 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } + // ADC/ADCX/SBB + case ADX: { + SmallVector<SDValue, 2> Results; + SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other); + SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2), + DAG.getConstant(-1, MVT::i8)); + SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3), + Op.getOperand(4), GenCF.getValue(1)); + SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0), + Op.getOperand(5), MachinePointerInfo(), + false, false, 0); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_B, MVT::i8), + Res.getValue(1)); + Results.push_back(SetCC); + Results.push_back(Store); + return DAG.getMergeValues(Results, dl); + } + case COMPRESS_TO_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToCompress = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + + if (isAllOnes(Mask)) // return just a store + return DAG.getStore(Chain, dl, DataToCompress, Addr, + MachinePointerInfo(), false, false, 0); + + EVT VT = DataToCompress.getValueType(); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, 
BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, + DataToCompress, DAG.getUNDEF(VT)); + return DAG.getStore(Chain, dl, Compressed, Addr, + MachinePointerInfo(), false, false, 0); + } + case EXPAND_FROM_MEM: { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue PathThru = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); + EVT VT = Op.getValueType(); + + if (isAllOnes(Mask)) // return just a load + return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, + false, 0); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + VT.getVectorNumElements()); + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getNode(ISD::BITCAST, dl, BitcastVT, Mask), + DAG.getIntPtrConstant(0)); + + SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), + false, false, false, 0); + + SmallVector<SDValue, 2> Results; + Results.push_back(DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, + PathThru)); + Results.push_back(Chain); + return DAG.getMergeValues(Results, dl); + } } - llvm_unreachable("Unknown Intrinsic Type"); } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, @@ -14589,8 +17751,8 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, @@ -14611,9 +17773,10 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = 
Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); - unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); + unsigned FrameReg = RegInfo->getPtrSizedFrameRegister( + DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); @@ -14640,8 +17803,8 @@ unsigned X86TargetLowering::getRegisterByName(const char* RegName, SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } @@ -14652,8 +17815,8 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDLoc dl (Op); EVT PtrVT = getPointerTy(); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && @@ -14700,7 +17863,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SDLoc dl (Op); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - const TargetRegisterInfo* TRI = DAG.getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo(); if 
(Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -14864,7 +18027,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); - const TargetFrameLowering &TFI = *TM.getFrameLowering(); + const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); @@ -15198,29 +18361,16 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. - // The internal representation is big endian. - // In other words, a i64 bitcasted to 2 x i32 has its high part at index 0 - // and its low part at index 1. - // Moreover, we have: Mul1 = <ae|cg> ; Mul2 = <bf|dh> - // Vector index 0 1 ; 2 3 - // We want <ae|bf|cg|dh> - // Vector index 0 2 1 3 - // Since each element is seen as 2 x i32, we get: - // high_mask[i] = 2 x vector_index[i] - // low_mask[i] = 2 x vector_index[i] + 1 - // where vector_index = {0, Size/2, 1, Size/2 + 1, ..., - // Size/2 - 1, Size/2 + Size/2 - 1} - // where Size is the number of element of the final vector. 
SDValue Highs, Lows; if (VT == MVT::v8i32) { - const int HighMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; + const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; + const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } else { - const int HighMask[] = {0, 4, 2, 6}; + const int HighMask[] = {1, 5, 3, 7}; Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {1, 5, 3, 7}; + const int LowMask[] = {0, 4, 2, 6}; Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); } @@ -15238,9 +18388,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup); } - // The low part of a MUL_LOHI is supposed to be the first value and the - // high part the second value. - return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Lows, Highs); + // The first result of MUL_LOHI is actually the low value, followed by the + // high value. + SDValue Ops[] = {Lows, Highs}; + return DAG.getMergeValues(Ops, dl); } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, @@ -15430,55 +18581,43 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, SDValue BaseShAmt; EVT EltVT = VT.getVectorElementType(); - if (Amt.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = VT.getVectorNumElements(); - unsigned i, j; - for (i = 0; i != NumElts; ++i) { - if (Amt.getOperand(i).getOpcode() == ISD::UNDEF) - continue; - break; - } - for (j = i; j != NumElts; ++j) { - SDValue Arg = Amt.getOperand(j); - if (Arg.getOpcode() == ISD::UNDEF) continue; - if (Arg != Amt.getOperand(i)) - break; - } - if (i != NumElts && j == NumElts) - BaseShAmt = Amt.getOperand(i); + if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { + // Check if this build_vector node is doing a splat. 
+ // If so, then set BaseShAmt equal to the splat value. + BaseShAmt = BV->getSplatValue(); + if (BaseShAmt && BaseShAmt.getOpcode() == ISD::UNDEF) + BaseShAmt = SDValue(); } else { if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) Amt = Amt.getOperand(0); - if (Amt.getOpcode() == ISD::VECTOR_SHUFFLE && - cast<ShuffleVectorSDNode>(Amt)->isSplat()) { + + ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt); + if (SVN && SVN->isSplat()) { + unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - unsigned NumElts = InVec.getValueType().getVectorNumElements(); - unsigned i = 0; - for (; i != NumElts; ++i) { - SDValue Arg = InVec.getOperand(i); - if (Arg.getOpcode() == ISD::UNDEF) continue; - BaseShAmt = Arg; - break; - } + assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + "Unexpected shuffle index found!"); + BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { - unsigned SplatIdx = - cast<ShuffleVectorSDNode>(Amt)->getSplatIndex(); if (C->getZExtValue() == SplatIdx) BaseShAmt = InVec.getOperand(1); } } - if (!BaseShAmt.getNode()) - BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt, - DAG.getIntPtrConstant(0)); + + if (!BaseShAmt) + // Avoid introducing an extract element from a shuffle. 
+ BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec, + DAG.getIntPtrConstant(SplatIdx)); } } if (BaseShAmt.getNode()) { - if (EltVT.bitsGT(MVT::i32)) - BaseShAmt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BaseShAmt); + assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); + if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) + BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); @@ -15596,7 +18735,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. - if (Op.getOpcode() == ISD::SHL && + if (Op.getOpcode() == ISD::SHL && (VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { @@ -15688,15 +18827,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, CanBeSimplified = Amt2 == Amt->getOperand(j); } } - + if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. EVT CastVT = MVT::v4i32; - SDValue Splat1 = + SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); - SDValue Splat2 = + SDValue Splat2 = DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); if (TargetOpcode == X86ISD::MOVSD) @@ -15851,10 +18990,15 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { Cond = X86::COND_B; break; case ISD::SMULO: - BaseOp = X86ISD::SMUL; + BaseOp = N->getValueType(0) == MVT::i8 ? 
X86ISD::SMUL8 : X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs + if (N->getValueType(0) == MVT::i8) { + BaseOp = X86ISD::UMUL8; + Cond = X86::COND_O; + break; + } SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32); SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS); @@ -15880,6 +19024,11 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } +// Sign extension of the low part of vector elements. This may be used either +// when sign extend instructions are not available or if the vector element +// sizes already match the sign-extended size. If the vector elements are in +// their pre-extended size and sign extend instructions are available, that will +// be handled by LowerSIGN_EXTEND. SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -15925,37 +19074,151 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, case MVT::v4i32: case MVT::v8i16: { SDValue Op0 = Op.getOperand(0); - SDValue Op00 = Op0.getOperand(0); - SDValue Tmp1; - // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. - if (Op0.getOpcode() == ISD::BITCAST && - Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { - // (sext (vzext x)) -> (vsext x) - Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); - if (Tmp1.getNode()) { - EVT ExtraEltVT = ExtraVT.getVectorElementType(); - // This folding is only valid when the in-reg type is a vector of i8, - // i16, or i32. - if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || - ExtraEltVT == MVT::i32) { - SDValue Tmp1Op0 = Tmp1.getOperand(0); - assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && - "This optimization is invalid without a VZEXT."); - return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); - } - Op0 = Tmp1; - } - } - // If the above didn't work, then just use Shift-Left + Shift-Right. 
- Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff, - DAG); - return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff, + // This is a sign extension of some low part of vector elements without + // changing the size of the vector elements themselves: + // Shift-Left + Shift-Right-Algebraic. + SDValue Shl = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, + BitsDiff, DAG); + return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Shl, BitsDiff, DAG); } } } +/// Returns true if the operand type is exactly twice the native width, and +/// the corresponding cmpxchg8b or cmpxchg16b instruction is available. +/// Used to know whether to use cmpxchg8/16b when expanding atomic operations +/// (otherwise we leave them alone to become __sync_fetch_and_... calls). +bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned OpWidth = MemType->getPrimitiveSizeInBits(); + + if (OpWidth == 64) + return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b + else if (OpWidth == 128) + return Subtarget.hasCmpxchg16b(); + else + return false; +} + +bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { + return needsCmpXchgNb(SI->getValueOperand()->getType()); +} + +// Note: this turns large loads into lock cmpxchg8b/16b. +// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. +bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { + auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); + return needsCmpXchgNb(PTy->getElementType()); +} + +bool X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 
64 : 32; + const Type *MemType = AI->getType(); + + // If the operand is too big, we must see if cmpxchg8/16b is available + // and default to library calls otherwise. + if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return needsCmpXchgNb(MemType); + + AtomicRMWInst::BinOp Op = AI->getOperation(); + switch (Op) { + default: + llvm_unreachable("Unknown atomic operation"); + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + // It's better to use xadd, xsub or xchg for these in all cases. + return false; + case AtomicRMWInst::Or: + case AtomicRMWInst::And: + case AtomicRMWInst::Xor: + // If the atomicrmw's result isn't actually used, we can just add a "lock" + // prefix to a normal instruction for these operations. + return !AI->use_empty(); + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + // These always require a non-trivial set of data operations on x86. We must + // use a cmpxchg loop. + return true; + } +} + +static bool hasMFENCE(const X86Subtarget& Subtarget) { + // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for + // no-sse2). There isn't any reason to disable it if the target processor + // supports it. + return Subtarget.hasSSE2() || Subtarget.is64Bit(); +} + +LoadInst * +X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { + const X86Subtarget &Subtarget = + getTargetMachine().getSubtarget<X86Subtarget>(); + unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; + const Type *MemType = AI->getType(); + // Accesses larger than the native width are turned into cmpxchg/libcalls, so + // there is no benefit in turning such RMWs into loads, and it is actually + // harmful as it introduces a mfence. 
+ if (MemType->getPrimitiveSizeInBits() > NativeWidth) + return nullptr; + + auto Builder = IRBuilder<>(AI); + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + auto SynchScope = AI->getSynchScope(); + // We must restrict the ordering to avoid generating loads with Release or + // ReleaseAcquire orderings. + auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); + auto Ptr = AI->getPointerOperand(); + + // Before the load we need a fence. Here is an example lifted from + // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence + // is required: + // Thread 0: + // x.store(1, relaxed); + // r1 = y.fetch_add(0, release); + // Thread 1: + // y.fetch_add(42, acquire); + // r2 = x.load(relaxed); + // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is + // lowered to just a load without a fence. A mfence flushes the store buffer, + // making the optimization clearly correct. + // FIXME: it is required if isAtLeastRelease(Order) but it is not clear + // otherwise, we might be able to be more agressive on relaxed idempotent + // rmw. In practice, they do not look useful, so we don't try to be + // especially clever. + if (SynchScope == SingleThread) { + // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at + // the IR level, so we must wrap it in an intrinsic. + return nullptr; + } else if (hasMFENCE(Subtarget)) { + Function *MFence = llvm::Intrinsic::getDeclaration(M, + Intrinsic::x86_sse2_mfence); + Builder.CreateCall(MFence); + } else { + // FIXME: it might make sense to use a locked operation here but on a + // different cache-line to prevent cache-line bouncing. In practice it + // is probably a small win, and x86 processors without mfence are rare + // enough that we do not bother. + return nullptr; + } + + // Finally we can emit the atomic load. 
+ LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, + AI->getType()->getPrimitiveSizeInBits()); + Loaded->setAtomic(Order, SynchScope); + AI->replaceAllUsesWith(Loaded); + AI->eraseFromParent(); + return Loaded; +} + static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -15967,10 +19230,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) { - // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for - // no-sse2). There isn't any reason to disable it if the target processor - // supports it. - if (Subtarget->hasSSE2() || Subtarget->is64Bit()) + if (hasMFENCE(*Subtarget)) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); @@ -16085,6 +19345,139 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + + Op = Op.getOperand(0); + EVT VT = Op.getValueType(); + assert((VT.is128BitVector() || VT.is256BitVector()) && + "CTPOP lowering only implemented for 128/256-bit wide vector types"); + + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + unsigned Len = EltVT.getSizeInBits(); + + // This is the vectorized version of the "best" algorithm from + // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + // with a minor tweak to use a series of adds + shifts instead of vector + // multiplications. 
Implemented for the v2i64, v4i64, v4i32, v8i32 types: + // + // v2i64, v4i64, v4i32 => Only profitable w/ popcnt disabled + // v8i32 => Always profitable + // + // FIXME: There a couple of possible improvements: + // + // 1) Support for i8 and i16 vectors (needs measurements if popcnt enabled). + // 2) Use strategies from http://wm.ite.pl/articles/sse-popcount.html + // + assert(EltVT.isInteger() && (Len == 32 || Len == 64) && Len % 8 == 0 && + "CTPOP not implemented for this vector element type."); + + // X86 canonicalize ANDs to vXi64, generate the appropriate bitcasts to avoid + // extra legalization. + bool NeedsBitcast = EltVT == MVT::i32; + MVT BitcastVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64; + + SDValue Cst55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)), EltVT); + SDValue Cst33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)), EltVT); + SDValue Cst0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)), EltVT); + + // v = v - ((v >> 1) & 0x55555555...) + SmallVector<SDValue, 8> Ones(NumElts, DAG.getConstant(1, EltVT)); + SDValue OnesV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ones); + SDValue Srl = DAG.getNode(ISD::SRL, dl, VT, Op, OnesV); + if (NeedsBitcast) + Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); + + SmallVector<SDValue, 8> Mask55(NumElts, Cst55); + SDValue M55 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask55); + if (NeedsBitcast) + M55 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M55); + + SDValue And = DAG.getNode(ISD::AND, dl, Srl.getValueType(), Srl, M55); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Op, And); + + // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...) 
+ SmallVector<SDValue, 8> Mask33(NumElts, Cst33); + SDValue M33 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask33); + SmallVector<SDValue, 8> Twos(NumElts, DAG.getConstant(2, EltVT)); + SDValue TwosV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Twos); + + Srl = DAG.getNode(ISD::SRL, dl, VT, Sub, TwosV); + if (NeedsBitcast) { + Srl = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Srl); + M33 = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M33); + Sub = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Sub); + } + + SDValue AndRHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Srl, M33); + SDValue AndLHS = DAG.getNode(ISD::AND, dl, M33.getValueType(), Sub, M33); + if (VT != AndRHS.getValueType()) { + AndRHS = DAG.getNode(ISD::BITCAST, dl, VT, AndRHS); + AndLHS = DAG.getNode(ISD::BITCAST, dl, VT, AndLHS); + } + SDValue Add = DAG.getNode(ISD::ADD, dl, VT, AndLHS, AndRHS); + + // v = (v + (v >> 4)) & 0x0F0F0F0F... + SmallVector<SDValue, 8> Fours(NumElts, DAG.getConstant(4, EltVT)); + SDValue FoursV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Fours); + Srl = DAG.getNode(ISD::SRL, dl, VT, Add, FoursV); + Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); + + SmallVector<SDValue, 8> Mask0F(NumElts, Cst0F); + SDValue M0F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Mask0F); + if (NeedsBitcast) { + Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); + M0F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M0F); + } + And = DAG.getNode(ISD::AND, dl, M0F.getValueType(), Add, M0F); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + + // The algorithm mentioned above uses: + // v = (v * 0x01010101...) >> (Len - 8) + // + // Change it to use vector adds + vector shifts which yield faster results on + // Haswell than using vector integer multiplication. 
+ // + // For i32 elements: + // v = v + (v >> 8) + // v = v + (v >> 16) + // + // For i64 elements: + // v = v + (v >> 8) + // v = v + (v >> 16) + // v = v + (v >> 32) + // + Add = And; + SmallVector<SDValue, 8> Csts; + for (unsigned i = 8; i <= Len/2; i *= 2) { + Csts.assign(NumElts, DAG.getConstant(i, EltVT)); + SDValue CstsV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Csts); + Srl = DAG.getNode(ISD::SRL, dl, VT, Add, CstsV); + Add = DAG.getNode(ISD::ADD, dl, VT, Add, Srl); + Csts.clear(); + } + + // The result is on the least significant 6-bits on i32 and 7-bits on i64. + SDValue Cst3F = DAG.getConstant(APInt(Len, Len == 32 ? 0x3F : 0x7F), EltVT); + SmallVector<SDValue, 8> Cst3FV(NumElts, Cst3F); + SDValue M3F = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Cst3FV); + if (NeedsBitcast) { + Add = DAG.getNode(ISD::BITCAST, dl, BitcastVT, Add); + M3F = DAG.getNode(ISD::BITCAST, dl, BitcastVT, M3F); + } + And = DAG.getNode(ISD::AND, dl, M3F.getValueType(), Add, M3F); + if (VT != And.getValueType()) + And = DAG.getNode(ISD::BITCAST, dl, VT, And); + + return And; +} + static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); @@ -16181,7 +19574,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy()); Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, NULL) + ? 
(Type*)StructType::get(ArgTy, ArgTy, nullptr) : (Type*)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); @@ -16212,6 +19605,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -16240,8 +19634,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FABS: return LowerFABS(Op, DAG); - case ISD::FNEG: return LowerFNEG(Op, DAG); + case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); + case ISD::FABS: + case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); @@ -16251,7 +19646,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -16292,29 +19687,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } -static void ReplaceATOMIC_LOAD(SDNode *Node, - 
SmallVectorImpl<SDValue> &Results, - SelectionDAG &DAG) { - SDLoc dl(Node); - EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT(); - - // Convert wide load -> cmpxchg8b/cmpxchg16b - // FIXME: On 32-bit, load -> fild or movq would be more efficient - // (The only way to get a 16-byte load is cmpxchg16b) - // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. - SDValue Zero = DAG.getConstant(0, VT); - SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); - SDValue Swap = - DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, - Node->getOperand(0), Node->getOperand(1), Zero, Zero, - cast<AtomicSDNode>(Node)->getMemOperand(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getOrdering(), - cast<AtomicSDNode>(Node)->getSynchScope()); - Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(2)); -} - /// ReplaceNodeResults - Replace a node with an illegal result type /// with a new node built out of custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, @@ -16325,6 +19697,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
+ case X86ISD::FMINC: + case X86ISD::FMIN: + case X86ISD::FMAXC: + case X86ISD::FMAX: { + EVT VT = N->getValueType(0); + if (VT != MVT::v2f32) + llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); + SDValue UNDEF = DAG.getUNDEF(VT); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(0), UNDEF); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + N->getOperand(1), UNDEF); + Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); + return; + } case ISD::SIGN_EXTEND_INREG: case ISD::ADDC: case ISD::ADDE: @@ -16473,12 +19861,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle - // should have already been dealt with by X86AtomicExpand.cpp. + // should have already been dealt with by AtomicExpandPass.cpp. break; - case ISD::ATOMIC_LOAD: { - ReplaceATOMIC_LOAD(N, Results, DAG); - return; } case ISD::BITCAST: { assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); @@ -16561,8 +19947,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::PSIGN: return "X86ISD::PSIGN"; - case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; + case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; @@ -16618,6 +20004,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; + case X86ISD::SMUL8: return "X86ISD::SMUL8"; + case X86ISD::UMUL8: return "X86ISD::UMUL8"; + case 
X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; + case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; @@ -16633,6 +20023,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; + case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; @@ -16652,7 +20043,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; - case X86ISD::VPERMILP: return "X86ISD::VPERMILP"; + case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; @@ -16678,6 +20069,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; + case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; + case X86ISD::EXPAND: return "X86ISD::EXPAND"; + case X86ISD::SELECT: return "X86ISD::SELECT"; } } @@ -16868,6 +20262,14 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, if (VT.getSizeInBits() == 64) return false; + // This is an experimental legality test that is tailored to match the + // legality test of the experimental lowering more closely. They are gated + // separately to ease testing of performance differences. 
+ if (ExperimentalVectorShuffleLegality) + // We only care that the types being shuffled are legal. The lowering can + // handle any possible shuffle mask that results. + return isTypeLegal(SVT); + // If this is a single-input shuffle with no 128 bit lane crossings we can // lower it into pshufb. if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) || @@ -16888,9 +20290,12 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return (SVT.getVectorNumElements() == 2 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isMOVLMask(M, SVT) || + isCommutedMOVLMask(M, SVT) || isMOVHLPSMask(M, SVT) || isSHUFPMask(M, SVT) || + isSHUFPMask(M, SVT, /* Commuted */ true) || isPSHUFDMask(M, SVT) || + isPSHUFDMask(M, SVT, /* SecondOperand */ true) || isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) || isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) || isPALIGNRMask(M, SVT, Subtarget) || @@ -16898,7 +20303,8 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isUNPCKHMask(M, SVT, Subtarget->hasInt256()) || isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()) || - isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256())); + isBlendMask(M, SVT, Subtarget->hasSSE41(), Subtarget->hasInt256()) || + (Subtarget->hasSSE41() && isINSERTPSMask(M, SVT))); } bool @@ -16908,6 +20314,14 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return false; MVT SVT = VT.getSimpleVT(); + + // This is an experimental legality test that is tailored to match the + // legality test of the experimental lowering more closely. They are gated + // separately to ease testing of performance differences. + if (ExperimentalVectorShuffleLegality) + // The new vector shuffle lowering is very good at managing zero-inputs. + return isShuffleMaskLegal(Mask, VT); + unsigned NumElts = SVT.getVectorNumElements(); // FIXME: This collection of masks seems suspect. 
if (NumElts == 2) @@ -16916,7 +20330,9 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, return (isMOVLMask(Mask, SVT) || isCommutedMOVLMask(Mask, SVT, true) || isSHUFPMask(Mask, SVT) || - isSHUFPMask(Mask, SVT, /* Commuted */ true)); + isSHUFPMask(Mask, SVT, /* Commuted */ true) || + isBlendMask(Mask, SVT, Subtarget->hasSSE41(), + Subtarget->hasInt256())); } return false; } @@ -17114,7 +20530,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information - const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); @@ -17266,7 +20682,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB - BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) + BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } @@ -17370,7 +20786,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. - const TargetInstrInfo *TII = MBB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MBB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); unsigned CountReg = MI->getOperand(0).getReg(); @@ -17380,7 +20796,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( if (!Subtarget->isTargetWin64()) { // If %al is 0, branch around the XMM save block. 
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); - BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); + BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } @@ -17453,7 +20869,7 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the @@ -17479,7 +20895,8 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. - const TargetRegisterInfo* TRI = BB->getParent()->getTarget().getRegisterInfo(); + const TargetRegisterInfo *TRI = + BB->getParent()->getSubtarget().getRegisterInfo(); if (!MI->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(MI, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -17518,17 +20935,20 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, } MachineBasicBlock * -X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, - bool Is64Bit) const { +X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, + MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); - unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; - unsigned TlsOffset = Is64Bit ? 0x70 : 0x30; + const bool Is64Bit = Subtarget->is64Bit(); + const bool IsLP64 = Subtarget->isTarget64BitLP64(); + + const unsigned TlsReg = Is64Bit ? 
X86::FS : X86::GS; + const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] @@ -17552,14 +20972,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = - getRegClassFor(Is64Bit ? MVT::i64:MVT::i32); + getRegClassFor(getPointerTy()); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI->getOperand(1).getReg(), - physSPReg = Is64Bit ? X86::RSP : X86::ESP; + physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = BB; ++MBBIter; @@ -17575,12 +20995,12 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) + BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); - BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr)) + BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); - BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB); + BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. 
@@ -17588,12 +21008,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); - BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. - const uint32_t *RegMask = - MF->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); - if (Is64Bit) { + const uint32_t *RegMask = MF->getTarget() + .getSubtargetImpl() + ->getRegisterInfo() + ->getCallPreservedMask(CallingConv::C); + if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) @@ -17601,6 +21023,14 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); + } else if (Is64Bit) { + BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) + .addReg(sizeVReg); + BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) + .addExternalSymbol("__morestack_allocate_stack_space") + .addRegMask(RegMask) + .addReg(X86::EDI, RegState::Implicit) + .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); @@ -17616,8 +21046,8 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) - .addReg(Is64Bit ? X86::RAX : X86::EAX); - BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB); + .addReg(IsLP64 ? X86::RAX : X86::EAX); + BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. 
BB->addSuccessor(bumpMBB); @@ -17641,10 +21071,10 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = BB->getParent()->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); - assert(!Subtarget->isTargetMacho()); + assert(!Subtarget->isTargetMachO()); // The lowering is pretty easy: we're just emitting the call to _alloca. The // non-trivial part is impdef of ESP. @@ -17674,8 +21104,10 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, .addReg(X86::RAX); } } else { - const char *StackProbeSymbol = - Subtarget->isTargetKnownWindowsMSVC() ? "_chkstk" : "_alloca"; + const char *StackProbeSymbol = (Subtarget->isTargetKnownWindowsMSVC() || + Subtarget->isTargetWindowsItanium()) + ? "_chkstk" + : "_alloca"; BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol(StackProbeSymbol) @@ -17698,8 +21130,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); - const X86InstrInfo *TII - = static_cast<const X86InstrInfo*>(F->getTarget().getInstrInfo()); + const X86InstrInfo *TII = + static_cast<const X86InstrInfo *>(F->getSubtarget().getInstrInfo()); DebugLoc DL = MI->getDebugLoc(); assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); @@ -17708,8 +21140,10 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. 
- const uint32_t *RegMask = - F->getTarget().getRegisterInfo()->getCallPreservedMask(CallingConv::C); + const uint32_t *RegMask = F->getTarget() + .getSubtargetImpl() + ->getRegisterInfo() + ->getCallPreservedMask(CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) @@ -17754,7 +21188,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); @@ -17795,6 +21229,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // v = phi(main, restore) // // restoreMBB: + // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; @@ -17860,8 +21295,8 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); @@ -17878,8 +21313,20 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: + if (RegInfo->hasBasePointer(*MF)) { + const X86Subtarget &STI = MF->getTarget().getSubtarget<X86Subtarget>(); + const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); + X86FI->setRestoreBasePointer(MF); + unsigned FramePtr = RegInfo->getFrameRegister(*MF); + unsigned BasePtr = 
RegInfo->getBaseRegister(); + unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; + addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), + FramePtr, true, X86FI->getRestoreBasePointerOffset()) + .setMIFlag(MachineInstr::FrameSetup); + } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); - BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI->eraseFromParent(); @@ -17891,7 +21338,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI->getDebugLoc(); MachineFunction *MF = MBB->getParent(); - const TargetInstrInfo *TII = MF->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference @@ -17906,8 +21353,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. - const X86RegisterInfo *RegInfo = - static_cast<const X86RegisterInfo*>(MF->getTarget().getRegisterInfo()); + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); @@ -17951,7 +21398,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer -// to remove extra copies in the loop. +// to remove extra copies in the loop. 
MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { @@ -18006,6 +21453,11 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, case X86::VFNMSUBPSr213r: NewFMAOpc = X86::VFNMSUBPSr231r; break; case X86::VFNMSUBSDr213r: NewFMAOpc = X86::VFNMSUBSDr231r; break; case X86::VFNMSUBSSr213r: NewFMAOpc = X86::VFNMSUBSSr231r; break; + case X86::VFMADDSUBPDr213r: NewFMAOpc = X86::VFMADDSUBPDr231r; break; + case X86::VFMADDSUBPSr213r: NewFMAOpc = X86::VFMADDSUBPSr231r; break; + case X86::VFMSUBADDPDr213r: NewFMAOpc = X86::VFMSUBADDPDr231r; break; + case X86::VFMSUBADDPSr213r: NewFMAOpc = X86::VFMSUBADDPSr231r; break; + case X86::VFMADDPDr213rY: NewFMAOpc = X86::VFMADDPDr231rY; break; case X86::VFMADDPSr213rY: NewFMAOpc = X86::VFMADDPSr231rY; break; case X86::VFMSUBPDr213rY: NewFMAOpc = X86::VFMSUBPDr231rY; break; @@ -18014,10 +21466,14 @@ X86TargetLowering::emitFMA3Instr(MachineInstr *MI, case X86::VFNMADDPSr213rY: NewFMAOpc = X86::VFNMADDPSr231rY; break; case X86::VFNMSUBPDr213rY: NewFMAOpc = X86::VFNMSUBPDr231rY; break; case X86::VFNMSUBPSr213rY: NewFMAOpc = X86::VFNMSUBPSr231rY; break; + case X86::VFMADDSUBPDr213rY: NewFMAOpc = X86::VFMADDSUBPDr231rY; break; + case X86::VFMADDSUBPSr213rY: NewFMAOpc = X86::VFMADDSUBPSr231rY; break; + case X86::VFMSUBADDPDr213rY: NewFMAOpc = X86::VFMSUBADDPDr231rY; break; + case X86::VFMSUBADDPSr213rY: NewFMAOpc = X86::VFMSUBADDPSr231rY; break; default: llvm_unreachable("Unrecognized FMA variant."); } - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(NewFMAOpc)) .addOperand(MI->getOperand(0)) @@ -18048,9 +21504,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); case X86::SEG_ALLOCA_32: - return EmitLoweredSegAlloca(MI, BB, false); case X86::SEG_ALLOCA_64: - return 
EmitLoweredSegAlloca(MI, BB, true); + return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -18083,7 +21538,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { MachineFunction *F = BB->getParent(); - const TargetInstrInfo *TII = F->getTarget().getInstrInfo(); + const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); // Change the floating point control register to use "round towards zero" @@ -18167,7 +21622,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRM128MEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRM(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitPCMPSTRM(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: @@ -18180,15 +21635,16 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VPCMPESTRIMEM: assert(Subtarget->hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); - return EmitPCMPSTRI(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitPCMPSTRI(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); // Thread synchronization. 
case X86::MONITOR: - return EmitMonitor(MI, BB, BB->getParent()->getTarget().getInstrInfo(), Subtarget); + return EmitMonitor(MI, BB, BB->getParent()->getSubtarget().getInstrInfo(), + Subtarget); // xbegin case X86::XBEGIN: - return EmitXBegin(MI, BB, BB->getParent()->getTarget().getInstrInfo()); + return EmitXBegin(MI, BB, BB->getParent()->getSubtarget().getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); @@ -18204,6 +21660,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); + case TargetOpcode::STATEPOINT: + // As an implementation detail, STATEPOINT shares the STACKMAP format at + // this point in the process. We diverge later. + return emitPatchPoint(MI, BB); + case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); @@ -18224,6 +21685,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFNMSUBPSr213r: case X86::VFNMSUBSDr213r: case X86::VFNMSUBSSr213r: + case X86::VFMADDSUBPDr213r: + case X86::VFMADDSUBPSr213r: + case X86::VFMSUBADDPDr213r: + case X86::VFMSUBADDPSr213r: case X86::VFMADDPDr213rY: case X86::VFMADDPSr213rY: case X86::VFMSUBPDr213rY: @@ -18232,6 +21697,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::VFNMADDPSr213rY: case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPSr213rY: + case X86::VFMADDSUBPDr213rY: + case X86::VFMADDSUBPSr213rY: + case X86::VFMSUBADDPDr213rY: + case X86::VFMSUBADDPSr213rY: return emitFMA3Instr(MI, BB); } } @@ -18461,6 +21930,329 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// \brief Combine an arbitrary chain of shuffles into a single instruction if +/// possible. +/// +/// This is the leaf of the recursive combinine below. 
When we have found some +/// chain of single-use x86 shuffle instructions and accumulated the combined +/// shuffle mask represented by them, this will try to pattern match that mask +/// into either a single instruction if there is a special purpose instruction +/// for this operation, or into a PSHUFB instruction which is a fully general +/// instruction but should only be used to replace chains over a certain depth. +static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, + int Depth, bool HasPSHUFB, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); + + // Find the operand that enters the chain. Note that multiple uses are OK + // here, we're not going to remove the operand we find. + SDValue Input = Op.getOperand(0); + while (Input.getOpcode() == ISD::BITCAST) + Input = Input.getOperand(0); + + MVT VT = Input.getSimpleValueType(); + MVT RootVT = Root.getSimpleValueType(); + SDLoc DL(Root); + + // Just remove no-op shuffle masks. + if (Mask.size() == 1) { + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Input), + /*AddTo*/ true); + return true; + } + + // Use the float domain if the operand type is a floating point type. + bool FloatDomain = VT.isFloatingPoint(); + + // For floating point shuffles, we don't have free copies in the shuffle + // instructions or the ability to load as part of the instruction, so + // canonicalize their shuffles to UNPCK or MOV variants. + // + // Note that even with AVX we prefer the PSHUFD form of shuffle for integer + // vectors because it can have a load folded into it that UNPCK cannot. This + // doesn't preclude something switching to the shorter encoding post-RA. + if (FloatDomain) { + if (Mask.equals(0, 0) || Mask.equals(1, 1)) { + bool Lo = Mask.equals(0, 0); + unsigned Shuffle; + MVT ShuffleVT; + // Check if we have SSE3 which will let us use MOVDDUP. 
That instruction + // is no slower than UNPCKLPD but has the option to fold the input operand + // into even an unaligned memory load. + if (Lo && Subtarget->hasSSE3()) { + Shuffle = X86ISD::MOVDDUP; + ShuffleVT = MVT::v2f64; + } else { + // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller + // than the UNPCK variants. + Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS; + ShuffleVT = MVT::v4f32; + } + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + if (Shuffle == X86ISD::MOVDDUP) + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + else + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Subtarget->hasSSE3() && + (Mask.equals(0, 0, 2, 2) || Mask.equals(1, 1, 3, 3))) { + bool Lo = Mask.equals(0, 0, 2, 2); + unsigned Shuffle = Lo ? X86ISD::MOVSLDUP : X86ISD::MOVSHDUP; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + if (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3)) { + bool Lo = Mask.equals(0, 0, 1, 1); + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + MVT ShuffleVT = MVT::v4f32; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! 
+ Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + } + + // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK + // variants as none of these have single-instruction variants that are + // superior to the UNPCK formulation. + if (!FloatDomain && + (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) || + Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) || + Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, + 15))) { + bool Lo = Mask[0] == 0; + unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + if (Depth == 1 && Root->getOpcode() == Shuffle) + return false; // Nothing to do! + MVT ShuffleVT; + switch (Mask.size()) { + case 8: + ShuffleVT = MVT::v8i16; + break; + case 16: + ShuffleVT = MVT::v16i8; + break; + default: + llvm_unreachable("Impossible mask size!"); + }; + Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input); + DCI.AddToWorklist(Op.getNode()); + Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Don't try to re-form single instruction chains under any circumstances now + // that we've done encoding canonicalization for them. + if (Depth < 2) + return false; + + // If we have 3 or more shuffle instructions or a chain involving PSHUFB, we + // can replace them with a single PSHUFB instruction profitably. Intel's + // manuals suggest only using PSHUFB if doing so replacing 5 instructions, but + // in practice PSHUFB tends to be *very* fast so we're more aggressive. 
+ if ((Depth >= 3 || HasPSHUFB) && Subtarget->hasSSSE3()) { + SmallVector<SDValue, 16> PSHUFBMask; + assert(Mask.size() <= 16 && "Can't shuffle elements smaller than bytes!"); + int Ratio = 16 / Mask.size(); + for (unsigned i = 0; i < 16; ++i) { + if (Mask[i / Ratio] == SM_SentinelUndef) { + PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); + continue; + } + int M = Mask[i / Ratio] != SM_SentinelZero + ? Ratio * Mask[i / Ratio] + i % Ratio + : 255; + PSHUFBMask.push_back(DAG.getConstant(M, MVT::i8)); + } + Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Input); + DCI.AddToWorklist(Op.getNode()); + SDValue PSHUFBMaskOp = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask); + DCI.AddToWorklist(PSHUFBMaskOp.getNode()); + Op = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, Op, PSHUFBMaskOp); + DCI.AddToWorklist(Op.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op), + /*AddTo*/ true); + return true; + } + + // Failed to find any combines. + return false; +} + +/// \brief Fully generic combining of x86 shuffle instructions. +/// +/// This should be the last combine run over the x86 shuffle instructions. Once +/// they have been fully optimized, this will recursively consider all chains +/// of single-use shuffle instructions, build a generic model of the cumulative +/// shuffle operation, and check for simpler instructions which implement this +/// operation. We use this primarily for two purposes: +/// +/// 1) Collapse generic shuffles to specialized single instructions when +/// equivalent. In most cases, this is just an encoding size win, but +/// sometimes we will collapse multiple generic shuffles into a single +/// special-purpose shuffle. +/// 2) Look for sequences of shuffle instructions with 3 or more total +/// instructions, and replace them with the slightly more expensive SSSE3 +/// PSHUFB instruction if available. 
We do this as the last combining step +/// to ensure we avoid using PSHUFB if we can implement the shuffle with +/// a suitable short sequence of other instructions. The PHUFB will either +/// use a register or have to read from memory and so is slightly (but only +/// slightly) more expensive than the other shuffle instructions. +/// +/// Because this is inherently a quadratic operation (for each shuffle in +/// a chain, we recurse up the chain), the depth is limited to 8 instructions. +/// This should never be an issue in practice as the shuffle lowering doesn't +/// produce sequences of more than 8 instructions. +/// +/// FIXME: We will currently miss some cases where the redundant shuffling +/// would simplify under the threshold for PSHUFB formation because of +/// combine-ordering. To fix this, we should do the redundant instruction +/// combining in this recursive walk. +static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, + ArrayRef<int> RootMask, + int Depth, bool HasPSHUFB, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // Bound the depth of our recursive combine because this is ultimately + // quadratic in nature. + if (Depth > 8) + return false; + + // Directly rip through bitcasts to find the underlying operand. + while (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).hasOneUse()) + Op = Op.getOperand(0); + + MVT VT = Op.getSimpleValueType(); + if (!VT.isVector()) + return false; // Bail if we hit a non-vector. + // FIXME: This routine should be taught about 256-bit shuffles, or a 256-bit + // version should be added. 
+ if (VT.getSizeInBits() != 128) + return false; + + assert(Root.getSimpleValueType().isVector() && + "Shuffles operate on vector types!"); + assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + "Can only combine shuffles of the same vector register size."); + + if (!isTargetShuffle(Op.getOpcode())) + return false; + SmallVector<int, 16> OpMask; + bool IsUnary; + bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, OpMask, IsUnary); + // We only can combine unary shuffles which we can decode the mask for. + if (!HaveMask || !IsUnary) + return false; + + assert(VT.getVectorNumElements() == OpMask.size() && + "Different mask size from vector size!"); + assert(((RootMask.size() > OpMask.size() && + RootMask.size() % OpMask.size() == 0) || + (OpMask.size() > RootMask.size() && + OpMask.size() % RootMask.size() == 0) || + OpMask.size() == RootMask.size()) && + "The smaller number of elements must divide the larger."); + int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size()); + int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size()); + assert(((RootRatio == 1 && OpRatio == 1) || + (RootRatio == 1) != (OpRatio == 1)) && + "Must not have a ratio for both incoming and op masks!"); + + SmallVector<int, 16> Mask; + Mask.reserve(std::max(OpMask.size(), RootMask.size())); + + // Merge this shuffle operation's mask into our accumulated mask. Note that + // this shuffle's mask will be the first applied to the input, followed by the + // root mask to get us all the way to the root value arrangement. The reason + // for this order is that we are recursing up the operation chain. + for (int i = 0, e = std::max(OpMask.size(), RootMask.size()); i < e; ++i) { + int RootIdx = i / RootRatio; + if (RootMask[RootIdx] < 0) { + // This is a zero or undef lane, we're done. 
+ Mask.push_back(RootMask[RootIdx]); + continue; + } + + int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio; + int OpIdx = RootMaskedIdx / OpRatio; + if (OpMask[OpIdx] < 0) { + // The incoming lanes are zero or undef, it doesn't matter which ones we + // are using. + Mask.push_back(OpMask[OpIdx]); + continue; + } + + // Ok, we have non-zero lanes, map them through. + Mask.push_back(OpMask[OpIdx] * OpRatio + + RootMaskedIdx % OpRatio); + } + + // See if we can recurse into the operand to combine more things. + switch (Op.getOpcode()) { + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; + } + + // Minor canonicalization of the accumulated shuffle mask to make it easier + // to match below. All this does is detect masks with squential pairs of + // elements, and shrink them to the half-width mask. It does this in a loop + // so it will reduce the size of the mask to the minimal width mask which + // performs an equivalent shuffle. + SmallVector<int, 16> WidenedMask; + while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { + Mask = std::move(WidenedMask); + WidenedMask.clear(); + } + + return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + Subtarget); +} + /// \brief Get the PSHUF-style mask from PSHUF node. 
/// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 @@ -18493,19 +22285,23 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) { /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. -static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); - // Walk up a single-use chain looking for a combinable shuffle. + // Walk up a single-use chain looking for a combinable shuffle. Keep a stack + // of the shuffles in the chain so that we can form a fresh chain to replace + // this one. + SmallVector<SDValue, 8> Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: - return false; // Nothing combined! + return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific @@ -18521,8 +22317,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::PSHUFHW: @@ -18530,8 +22327,9 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // dword shuffle, and the low words are self-contained. 
if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) - return false; + return SDValue(); + Chain.push_back(V); continue; case X86ISD::UNPCKL: @@ -18539,25 +22337,28 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getValueType() != MVT::v16i8 && V.getValueType() != MVT::v8i16) - return false; + return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) - return false; + return SDValue(); + Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: - return false; // Nothing to combine. + return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; + Chain.push_back(V); + // Fallthrough! case ISD::BITCAST: V = V.getOperand(0); @@ -18573,10 +22374,7 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. - return false; - - // Record the old value to use in RAUW-ing. - SDValue Old = V; + return SDValue(); // Merge this node's mask and our incoming mask. SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); @@ -18585,20 +22383,34 @@ static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // It is possible that one of the combinable shuffles was completely absorbed - // by the other, just replace it and revisit all users in that case. 
- if (Old.getNode() == V.getNode()) { - DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo=*/true); - return true; - } + // Rebuild the chain around this new shuffle. + while (!Chain.empty()) { + SDValue W = Chain.pop_back_val(); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + if (V.getValueType() != W.getOperand(0).getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, W.getOperand(0).getValueType(), V); - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - return true; + switch (W.getOpcode()) { + default: + llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); + + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); + break; + + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); + break; + } + } + if (V.getValueType() != N.getValueType()) + V = DAG.getNode(ISD::BITCAST, DL, N.getValueType(), V); + + // Return the new chain to replace N. + return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. @@ -18634,26 +22446,6 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, // Other-half shuffles are no-ops. continue; - - case X86ISD::PSHUFD: { - // We can only handle pshufd if the half we are combining either stays in - // its half, or switches to the other half. Bail if one of these isn't - // true. - SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); - int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2; - if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) || - (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2))) - return false; - - // Map the mask through the pshufd and keep walking up the chain. 
- for (int i = 0; i < 4; ++i) - Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2; - - // Switch halves if the pshufd does. - CombineOpcode = - VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; - continue; - } } // Break out of the loop if we break out of the switch. break; @@ -18663,7 +22455,11 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, // We fell out of the loop without finding a viable combining instruction. return false; - // Record the old value to use in RAUW-ing. + // Combine away the bottom node as its shuffle will be accumulated into + // a preceding shuffle. + DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); + + // Record the old value. SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all @@ -18674,12 +22470,13 @@ static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask, V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DAG)); - // Replace N with its operand as we're going to combine that shuffle away. - DAG.ReplaceAllUsesWith(N, N.getOperand(0)); + // Check that the shuffles didn't cancel each other out. If not, we need to + // combine to the new one. + if (Old != V) + // Replace the combinable shuffle with the combined one, updating all users + // so that we re-evaluate the chain here. + DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); - // Replace the combinable shuffle with the combined one, updating all users - // so that we re-evaluate the chain here. - DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } @@ -18720,13 +22517,13 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can - // combine with more operations. 
- if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 && - areAdjacentMasksSequential(Mask)) { - int DMask[] = {-1, -1, -1, -1}; + // combine with more operations. Note that it has to at least flip the + // dwords as otherwise it would have been removed as a no-op. + if (Mask[0] == 2 && Mask[1] == 3 && Mask[2] == 0 && Mask[3] == 1) { + int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; - DMask[DOffset + 0] = DOffset + Mask[0] / 2; - DMask[DOffset + 1] = DOffset + Mask[2] / 2; + DMask[DOffset + 0] = DOffset + 1; + DMask[DOffset + 1] = DOffset + 0; V = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V, @@ -18779,8 +22576,8 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFD: - if (combineRedundantDWordShuffle(N, Mask, DAG, DCI)) - return SDValue(); // We combined away this shuffle. + if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI)) + return NewN; break; } @@ -18788,6 +22585,61 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return SDValue(); } +/// \brief Try to combine a shuffle into a target-specific add-sub node. +/// +/// We combine this directly on the abstract vector shuffle nodes so it is +/// easier to generically match. We also insert dummy vector shuffle nodes for +/// the operands which explicitly discard the lanes which are unused by this +/// operation to try to flow through the rest of the combiner the fact that +/// they're unused. +static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // We only handle target-independent shuffles. + // FIXME: It would be easy and harmless to use the target shuffle mask + // extraction tool to support more. 
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + + auto *SVN = cast<ShuffleVectorSDNode>(N); + ArrayRef<int> Mask = SVN->getMask(); + SDValue V1 = N->getOperand(0); + SDValue V2 = N->getOperand(1); + + // We require the first shuffle operand to be the SUB node, and the second to + // be the ADD node. + // FIXME: We should support the commuted patterns. + if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + return SDValue(); + + // If there are other uses of these operations we can't fold them. + if (!V1->hasOneUse() || !V2->hasOneUse()) + return SDValue(); + + // Ensure that both operations have the same operands. Note that we can + // commute the FADD operands. + SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); + if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && + (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) + return SDValue(); + + // We're looking for blends between FADD and FSUB nodes. We insist on these + // nodes being lined up in a specific expected pattern. + if (!(isShuffleEquivalent(Mask, 0, 3) || + isShuffleEquivalent(Mask, 0, 5, 2, 7) || + isShuffleEquivalent(Mask, 0, 9, 2, 11, 4, 13, 6, 15))) + return SDValue(); + + // Only specific types are legal at this point, assert so we notice if and + // when these change. + assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f32 || + VT == MVT::v4f64) && + "Unknown vector type encountered!"); + + return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS); +} + /// PerformShuffleCombine - Performs several different shuffle combines. 
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -18797,54 +22649,17 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - // Canonicalize shuffles that perform 'addsub' on packed float vectors - // according to the rule: - // (shuffle (FADD A, B), (FSUB A, B), Mask) -> - // (shuffle (FSUB A, -B), (FADD A, -B), Mask) - // - // Where 'Mask' is: - // <0,5,2,7> -- for v4f32 and v4f64 shuffles; - // <0,3> -- for v2f64 shuffles; - // <0,9,2,11,4,13,6,15> -- for v8f32 shuffles. - // - // This helps pattern-matching more SSE3/AVX ADDSUB instructions - // during ISel stage. - if (N->getOpcode() == ISD::VECTOR_SHUFFLE && - ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || - (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - N0->getOpcode() == ISD::FADD && N1->getOpcode() == ISD::FSUB && - // Operands to the FADD and FSUB must be the same. - ((N0->getOperand(0) == N1->getOperand(0) && - N0->getOperand(1) == N1->getOperand(1)) || - // FADD is commutable. See if by commuting the operands of the FADD - // we would still be able to match the operands of the FSUB dag node. - (N0->getOperand(1) == N1->getOperand(0) && - N0->getOperand(0) == N1->getOperand(1))) && - N0->getOperand(0)->getOpcode() != ISD::UNDEF && - N0->getOperand(1)->getOpcode() != ISD::UNDEF) { - - ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N); - unsigned NumElts = VT.getVectorNumElements(); - ArrayRef<int> Mask = SV->getMask(); - bool CanFold = true; - - for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) - CanFold = Mask[i] == (int)((i & 1) ? 
i + NumElts : i); - - if (CanFold) { - SDValue Op0 = N1->getOperand(0); - SDValue Op1 = DAG.getNode(ISD::FNEG, dl, VT, N1->getOperand(1)); - SDValue Sub = DAG.getNode(ISD::FSUB, dl, VT, Op0, Op1); - SDValue Add = DAG.getNode(ISD::FADD, dl, VT, Op0, Op1); - return DAG.getVectorShuffle(VT, dl, Sub, Add, Mask); - } - } - // Don't create instructions with illegal types after legalize types has run. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType())) return SDValue(); + // If we have legalized the vector types, look for blends of FADD and FSUB + // nodes that we can fuse into an ADDSUB node. + if (TLI.isTypeLegal(VT) && Subtarget->hasSSE3()) + if (SDValue AddSub = combineShuffleToAddSub(N, DAG)) + return AddSub; + // Combine 256-bit vector shuffles. This is only profitable when in AVX mode if (Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) @@ -18869,7 +22684,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); - + if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { @@ -18921,6 +22736,18 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, PerformTargetShuffleCombine(SDValue(N, 0), DAG, DCI, Subtarget); if (Shuffle.getNode()) return Shuffle; + + // Try recursively combining arbitrary sequences of x86 shuffle + // instructions into higher-order shuffles. We do this after combining + // specific PSHUF instruction sequences into their minimal form so that we + // can evaluate how many specialized shuffle instructions are involved in + // a particular chain. + SmallVector<int, 1> NonceMask; // Just a placeholder. 
+ NonceMask.push_back(0); + if (combineX86ShufflesRecursively(SDValue(N, 0), SDValue(N, 0), NonceMask, + /*Depth*/ 1, /*HasPSHUFB*/ false, DAG, + DCI, Subtarget)) + return SDValue(); // This routine will use CombineTo to replace N. } return SDValue(); @@ -18938,7 +22765,7 @@ static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, /// XFormVExtractWithShuffleIntoLoad - Check if a vector extract from a target /// specific shuffle of a load can be folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but -/// shuffles have been customed lowered so we need to handle those here. +/// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) @@ -18950,20 +22777,20 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!isa<ConstantSDNode>(EltNo)) return SDValue(); - EVT VT = InVec.getValueType(); + EVT OriginalVT = InVec.getValueType(); - bool HasShuffleIntoBitcast = false; if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. 
if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); - if (BCVT.getVectorNumElements() != VT.getVectorNumElements()) + if (BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); InVec = InVec.getOperand(0); - HasShuffleIntoBitcast = true; } + EVT CurrentVT = InVec.getValueType(); + if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); @@ -18973,12 +22800,12 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleMask; bool UnaryShuffle; - if (!getTargetShuffleMask(InVec.getNode(), VT.getSimpleVT(), ShuffleMask, - UnaryShuffle)) + if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), + ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. - unsigned NumElems = VT.getVectorNumElements(); + unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? -1 : ShuffleMask[Elt]; SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) @@ -19004,35 +22831,37 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); - if (HasShuffleIntoBitcast) { - // If there's a bitcast before the shuffle, check if the load type and - // alignment is valid. - unsigned Align = LN0->getAlignment(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getDataLayout()-> - getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + EVT EltVT = N->getValueType(0); + // If there's a bitcast before the shuffle, check if the load type and + // alignment is valid. 
+ unsigned Align = LN0->getAlignment(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NewAlign = TLI.getDataLayout()->getABITypeAlignment( + EltVT.getTypeForEVT(*DAG.getContext())); - if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) - return SDValue(); - } + if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) + return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1); - Shuffle = DAG.getVectorShuffle(InVec.getValueType(), dl, + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) + : InVec.getOperand(1); + Shuffle = DAG.getVectorShuffle(CurrentVT, dl, InVec.getOperand(0), Shuffle, &ShuffleMask[0]); - Shuffle = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); + Shuffle = DAG.getNode(ISD::BITCAST, dl, OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index /// generation and convert it from being a bunch of shuffles and extracts -/// to a simple store and scalar loads to extract the elements. +/// into a somewhat faster sequence. For i686, the best sequence is apparently +/// storing the value and loading scalars back, while for x64 we should +/// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); @@ -19091,36 +22920,61 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // Ok, we've now decided to do the transformation. + // If 64-bit shifts are legal, use the extract-shift sequence, + // otherwise bounce the vector off the cache. 
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Vals[4]; SDLoc dl(InputVector); - // Store the value to a temporary stack slot. - SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); - SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, - MachinePointerInfo(), false, false, 0); + if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { + SDValue Cst = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, InputVector); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(0, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, + DAG.getConstant(1, VecIdxTy)); + + SDValue ShAmt = DAG.getConstant(32, + DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64)); + Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); + Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); + Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); + Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, + DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); + } else { + // Store the value to a temporary stack slot. + SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); + SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, + MachinePointerInfo(), false, false, 0); + + EVT ElementType = InputVector.getValueType().getVectorElementType(); + unsigned EltSize = ElementType.getSizeInBits() / 8; - // Replace each use (extract) with a load of the appropriate element. - for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), - UE = Uses.end(); UI != UE; ++UI) { - SDNode *Extract = *UI; + // Replace each use (extract) with a load of the appropriate element. 
+ for (unsigned i = 0; i < 4; ++i) { + uint64_t Offset = EltSize * i; + SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); - // cOMpute the element's address. - SDValue Idx = Extract->getOperand(1); - unsigned EltSize = - InputVector.getValueType().getVectorElementType().getSizeInBits()/8; - uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); + SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), + StackPtr, OffsetVal); - SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), - StackPtr, OffsetVal); + // Load the scalar. + Vals[i] = DAG.getLoad(ElementType, dl, Ch, + ScalarAddr, MachinePointerInfo(), + false, false, false, 0); - // Load the scalar. - SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, - ScalarAddr, MachinePointerInfo(), - false, false, false, 0); + } + } + + // Replace the extracts + for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), + UE = Uses.end(); UI != UE; ++UI) { + SDNode *Extract = *UI; - // Replace the exact with the load. - DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); + SDValue Idx = Extract->getOperand(1); + uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; don't return anything. 
@@ -19137,6 +22991,21 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, bool NeedSplit = false; switch (VT.getSimpleVT().SimpleTy) { default: return std::make_pair(0, false); + case MVT::v4i64: + case MVT::v2i64: + if (!Subtarget->hasVLX()) + return std::make_pair(0, false); + break; + case MVT::v64i8: + case MVT::v32i16: + if (!Subtarget->hasBWI()) + return std::make_pair(0, false); + break; + case MVT::v16i32: + case MVT::v8i64: + if (!Subtarget->hasAVX512()) + return std::make_pair(0, false); + break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: @@ -19203,7 +23072,7 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, } static SDValue -TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, +transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { SDLoc dl(N); SDValue Cond = N->getOperand(0); @@ -19216,25 +23085,21 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, Cond = CondSrc->getOperand(0); } - MVT VT = N->getSimpleValueType(0); - MVT EltVT = VT.getVectorElementType(); - unsigned NumElems = VT.getVectorNumElements(); - // There is no blend with immediate in AVX-512. - if (VT.is512BitVector()) - return SDValue(); - - if (!Subtarget->hasSSE41() || EltVT == MVT::i8) - return SDValue(); - if (!Subtarget->hasInt256() && VT == MVT::v16i16) + if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); - if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) + // A vselect where all conditions and data are constants can be optimized into + // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). 
+ if (ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && + ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); unsigned MaskValue = 0; if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue)) return SDValue(); + MVT VT = N->getSimpleValueType(0); + unsigned NumElems = VT.getVectorNumElements(); SmallVector<int, 8> ShuffleMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { // Be sure we emit undef where we can. @@ -19244,6 +23109,9 @@ TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1); } + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isShuffleMaskLegal(ShuffleMask, VT)) + return SDValue(); return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]); } @@ -19264,8 +23132,9 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // instructions match the semantics of the common C idiom x<y?x:y but not // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). + // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && TLI.isTypeLegal(VT) && + VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -19408,13 +23277,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1) { // v16i8 (select v16i1, v16i8, v16i8) does not have a proper - // lowering on AVX-512. In this case we convert it to + // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. 
- // The same situation for all 128 and 256-bit vectors of i8 and i16 + // The same situation for all 128 and 256-bit vectors of i8 and i16. + // Since SKX these selects have a proper lowering. EVT OpVT = LHS.getValueType(); if ((OpVT.is128BitVector() || OpVT.is256BitVector()) && (OpVT.getVectorElementType() == MVT::i8 || - OpVT.getVectorElementType() == MVT::i16)) { + OpVT.getVectorElementType() == MVT::i16) && + !(Subtarget->hasBWI() && Subtarget->hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS); @@ -19634,22 +23505,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Opc, DL, VT, LHS, RHS); } - // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. - if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && - // Check if SETCC has already been promoted - TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT && - // Check that condition value type matches vselect operand type - CondVT == VT) { - + // Simplify vector selection if condition value type matches vselect + // operand type + if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { assert(Cond.getValueType().isVector() && "vector select expects a vector selector!"); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); - if (!TValIsAllOnes && !FValIsAllZeros) { - // Try invert the condition if true value is not all 1s and false value - // is not all 0s. + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. 
+ if (!TValIsAllOnes && !FValIsAllZeros && + // Check if the selector will be produced by CMPP*/PCMP* + Cond.getOpcode() == ISD::SETCC && + // Check if SETCC has already been promoted + TLI.getSetCCResultType(*DAG.getContext(), VT) == CondVT) { bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); @@ -19681,81 +23552,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // Try to fold this VSELECT into a MOVSS/MOVSD - if (N->getOpcode() == ISD::VSELECT && - Cond.getOpcode() == ISD::BUILD_VECTOR && !DCI.isBeforeLegalize()) { - if (VT == MVT::v4i32 || VT == MVT::v4f32 || - (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) { - bool CanFold = false; - unsigned NumElems = Cond.getNumOperands(); - SDValue A = LHS; - SDValue B = RHS; - - if (isZero(Cond.getOperand(0))) { - CanFold = true; - - // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B) - // fold (vselect <0,-1> -> (movsd A, B) - for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) - CanFold = isAllOnes(Cond.getOperand(i)); - } else if (isAllOnes(Cond.getOperand(0))) { - CanFold = true; - std::swap(A, B); - - // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A) - // fold (vselect <-1,0> -> (movsd B, A) - for (unsigned i = 1, e = NumElems; i != e && CanFold; ++i) - CanFold = isZero(Cond.getOperand(i)); - } - - if (CanFold) { - if (VT == MVT::v4i32 || VT == MVT::v4f32) - return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG); - return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG); - } - - if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) { - // fold (v4i32: vselect <0,0,-1,-1>, A, B) -> - // (v4i32 (bitcast (movsd (v2i64 (bitcast A)), - // (v2i64 (bitcast B))))) - // - // fold (v4f32: vselect <0,0,-1,-1>, A, B) -> - // (v4f32 (bitcast (movsd (v2f64 (bitcast A)), - // (v2f64 (bitcast B))))) - // - // fold (v4i32: vselect <-1,-1,0,0>, A, B) -> - // (v4i32 (bitcast 
(movsd (v2i64 (bitcast B)), - // (v2i64 (bitcast A))))) - // - // fold (v4f32: vselect <-1,-1,0,0>, A, B) -> - // (v4f32 (bitcast (movsd (v2f64 (bitcast B)), - // (v2f64 (bitcast A))))) - - CanFold = (isZero(Cond.getOperand(0)) && - isZero(Cond.getOperand(1)) && - isAllOnes(Cond.getOperand(2)) && - isAllOnes(Cond.getOperand(3))); - - if (!CanFold && isAllOnes(Cond.getOperand(0)) && - isAllOnes(Cond.getOperand(1)) && - isZero(Cond.getOperand(2)) && - isZero(Cond.getOperand(3))) { - CanFold = true; - std::swap(LHS, RHS); - } - - if (CanFold) { - EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64; - SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS); - SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS); - SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA, - NewB, DAG); - return DAG.getNode(ISD::BITCAST, DL, VT, Select); - } - } - } - } - // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits @@ -19767,22 +23563,17 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // build_vector of constants. This will be taken care in a later // condition. (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 && - VT != MVT::v8i16)) { + VT != MVT::v8i16) && + // Don't optimize vector of constants. Those are handled by + // the generic code and all the bits must be properly set for + // the generic optimizer. + !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); - // Check all uses of that condition operand to check whether it will be - // consumed by non-BLEND instructions, which may depend on all bits are set - // properly. 
- for (SDNode::use_iterator I = Cond->use_begin(), - E = Cond->use_end(); I != E; ++I) - if (I->getOpcode() != ISD::VSELECT) - // TODO: Add other opcodes eventually lowered into BLEND. - return SDValue(); - assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); @@ -19790,8 +23581,45 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), DCI.isBeforeLegalizeOps()); if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) || - TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, TLO)) - DCI.CommitTargetLoweringOpt(TLO); + TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne, + TLO)) { + // If we changed the computation somewhere in the DAG, this change + // will affect all users of Cond. + // Make sure it is fine and update all the nodes so that we do not + // use the generic VSELECT anymore. Otherwise, we may perform + // wrong optimizations as we messed up with the actual expectation + // for the vector boolean values. + if (Cond != TLO.Old) { + // Check all uses of that condition operand to check whether it will be + // consumed by non-BLEND instructions, which may depend on all bits are + // set properly. + for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + if (I->getOpcode() != ISD::VSELECT) + // TODO: Add other opcodes eventually lowered into BLEND. + return SDValue(); + + // Update all the users of the condition, before committing the change, + // so that the VSELECT optimizations that expect the correct vector + // boolean value will not be triggered. 
+ for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end(); + I != E; ++I) + DAG.ReplaceAllUsesOfValueWith( + SDValue(*I, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0), + Cond, I->getOperand(1), I->getOperand(2))); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(); + } + // At this point, only Cond is changed. Change the condition + // just for N to keep the opportunity to optimize all other + // users their own way. + DAG.ReplaceAllUsesOfValueWith( + SDValue(N, 0), + DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0), + TLO.New, N->getOperand(1), N->getOperand(2))); + return SDValue(); + } } // We should generate an X86ISD::BLENDI from a vselect if its argument @@ -19805,8 +23633,10 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // Iff we find this pattern and the build_vectors are built from // constants, we translate the vselect into a shuffle_vector that we // know will be matched by LowerVECTOR_SHUFFLEtoBlend. - if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) { - SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); + if ((N->getOpcode() == ISD::VSELECT || + N->getOpcode() == X86ISD::SHRUNKBLEND) && + !DCI.isBeforeLegalize()) { + SDValue Shuffle = transformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget); if (Shuffle.getNode()) return Shuffle; } @@ -20163,7 +23993,7 @@ static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, // fold (blend A, B, allOnes) -> B if (ISD::isBuildVectorAllOnes(Mask.getNode())) return Op1; - + // Simplify the case where the mask is a constant i32 value. 
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { if (C->isNullValue()) @@ -20871,13 +24701,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned RegSz = RegVT.getSizeInBits(); - // On Sandybridge unaligned 256bit loads are inefficient. + // For chips with slow 32-byte unaligned loads, break the 32-byte operation + // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Alignment = Ld->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && !Subtarget->hasInt256() && + if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) @@ -20907,156 +24737,169 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, NewVec, TF, true); } - // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the - // expansion is still better than scalar code. - // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll - // emit a shuffle and a arithmetic shift. - // TODO: It is possible to support ZExt by zeroing the undef values - // during the shuffle phase or after the shuffle. 
- if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() && - (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) { - assert(MemVT != RegVT && "Cannot extend to the same type"); - assert(MemVT.isVector() && "Must load a vector from memory"); - - unsigned NumElems = RegVT.getVectorNumElements(); - unsigned MemSz = MemVT.getSizeInBits(); - assert(RegSz > MemSz && "Register size must be greater than the mem size"); - - if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256()) - return SDValue(); - - // All sizes must be a power of two. - if (!isPowerOf2_32(RegSz * MemSz * NumElems)) - return SDValue(); - - // Attempt to load the original value using scalar loads. - // Find the largest scalar type that divides the total loaded size. - MVT SclrLoadTy = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; - if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { - SclrLoadTy = Tp; - } - } - - // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. - if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && - (64 <= MemSz)) - SclrLoadTy = MVT::f64; - - // Calculate the number of scalar loads that we need to perform - // in order to load our vector from memory. - unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); - if (Ext == ISD::SEXTLOAD && NumLoads > 1) - return SDValue(); - - unsigned loadRegZize = RegSz; - if (Ext == ISD::SEXTLOAD && RegSz == 256) - loadRegZize /= 2; - - // Represent our vector as a sequence of elements which are the - // largest scalar that we can load. - EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy, - loadRegZize/SclrLoadTy.getSizeInBits()); - - // Represent the data using the same element type that is stored in - // memory. In practice, we ''widen'' MemVT. 
- EVT WideVecVT = - EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize/MemVT.getScalarType().getSizeInBits()); + return SDValue(); +} - assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && - "Invalid vector type"); +/// PerformMLOADCombine - Resolve extending loads +static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + if (Mld->getExtensionType() != ISD::SEXTLOAD) + return SDValue(); - // We can't shuffle using an illegal type. - if (!TLI.isTypeLegal(WideVecVT)) - return SDValue(); + EVT VT = Mld->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + EVT LdVT = Mld->getMemoryVT(); + SDLoc dl(Mld); + + assert(LdVT != VT && "Cannot extend to the same type"); + unsigned ToSz = VT.getVectorElementType().getSizeInBits(); + unsigned FromSz = LdVT.getVectorElementType().getSizeInBits(); + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for extending masked load"); + + unsigned SizeRatio = ToSz / FromSz; + assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); + + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + LdVT.getScalarType(), NumElems*SizeRatio); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); + + // Convert Src0 value + SDValue WideSrc0 = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mld->getSrc0()); + if (Mld->getSrc0().getOpcode() != ISD::UNDEF) { + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; - SmallVector<SDValue, 8> Chains; - SDValue Ptr = Ld->getBasePtr(); - SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits()/8, - TLI.getPointerTy()); - SDValue Res = 
DAG.getUNDEF(LoadUnitVecVT); - - for (unsigned i = 0; i < NumLoads; ++i) { - // Perform a single load. - SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), - Ptr, Ld->getPointerInfo(), - Ld->isVolatile(), Ld->isNonTemporal(), - Ld->isInvariant(), Ld->getAlignment()); - Chains.push_back(ScalarLoad.getValue(1)); - // Create the first element type using SCALAR_TO_VECTOR in order to avoid - // another round of DAGCombining. - if (i == 0) - Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); - else - Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, - ScalarLoad, DAG.getIntPtrConstant(i)); + // Can't shuffle using an illegal type. + assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal"); + WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, + DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); + } + // Prepare the new mask + SDValue NewMask; + SDValue Mask = Mld->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); + + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; + + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); + } + + 
SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), + Mld->getBasePtr(), NewMask, WideSrc0, + Mld->getMemoryVT(), Mld->getMemOperand(), + ISD::NON_EXTLOAD); + SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); + return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); - Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); - } +} +/// PerformMSTORECombine - Resolve truncating stores +static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); + if (!Mst->isTruncatingStore()) + return SDValue(); - SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + EVT VT = Mst->getValue().getValueType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElems = VT.getVectorNumElements(); + EVT StVT = Mst->getMemoryVT(); + SDLoc dl(Mst); - // Bitcast the loaded value to a vector of the original element type, in - // the size of the target vector type. - SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res); - unsigned SizeRatio = RegSz/MemSz; + assert(StVT != VT && "Cannot truncate to the same type"); + unsigned FromSz = VT.getVectorElementType().getSizeInBits(); + unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); - if (Ext == ISD::SEXTLOAD) { - // If we have SSE4.1 we can directly emit a VSEXT node. - if (Subtarget->hasSSE41()) { - SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec); - return DCI.CombineTo(N, Sext, TF, true); - } + // From, To sizes and ElemCount must be pow of two + assert (isPowerOf2_32(NumElems * FromSz * ToSz) && + "Unexpected size for truncating masked store"); + // We are going to use the original vector elt for storing. + // Accumulated smaller vector elements must be a multiple of the store size. 
+ assert (((NumElems * FromSz) % ToSz) == 0 && + "Unexpected ratio for truncating masked store"); - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. - if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT)) - return SDValue(); + unsigned SizeRatio = FromSz / ToSz; + assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); - // Redistribute the loaded elements into the different locations. - SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i*SizeRatio + SizeRatio-1] = i; + // Create a type on which we perform the shuffle + EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), + StVT.getScalarType(), NumElems*SizeRatio); - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); + SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mst->getValue()); + SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i] = i * SizeRatio; - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, RegVT)); + // Can't shuffle using an illegal type. + assert (TLI.isTypeLegal(WideVecVT) && "WideVecVT should be legal"); - return DCI.CombineTo(N, Shuff, TF, true); - } + SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); - // Redistribute the loaded elements into the different locations. 
- SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); + SDValue NewMask; + SDValue Mask = Mst->getMask(); + if (Mask.getValueType() == VT) { + // Mask and original value have the same type + NewMask = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i*SizeRatio] = i; + ShuffleVec[i] = i * SizeRatio; + for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) + ShuffleVec[i] = NumElems*SizeRatio; + NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, + DAG.getConstant(0, WideVecVT), + &ShuffleVec[0]); + } + else { + assert(Mask.getValueType().getVectorElementType() == MVT::i1); + unsigned WidenNumElts = NumElems*SizeRatio; + unsigned MaskNumElts = VT.getVectorNumElements(); + EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WidenNumElts); - SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + unsigned NumConcat = WidenNumElts / MaskNumElts; + SmallVector<SDValue, 16> Ops(NumConcat); + SDValue ZeroVal = DAG.getConstant(0, Mask.getValueType()); + Ops[0] = Mask; + for (unsigned i = 1; i != NumConcat; ++i) + Ops[i] = ZeroVal; - // Bitcast to the requested type. - Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff); - // Replace the original load with the new sequence - // and return the new chain. - return DCI.CombineTo(N, Shuff, TF, true); + NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return SDValue(); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), + NewMask, StVT, Mst->getMemOperand(), false); } - /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -21067,13 +24910,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // If we are saving a concatenation of two XMM registers, perform two stores. - // On Sandy Bridge, 256-bit memory operations are executed by two - // 128-bit ports. However, on Haswell it is better to issue a single 256-bit - // memory operation. + // If we are saving a concatenation of two XMM registers and 32-byte stores + // are slow, such as on Sandy Bridge, perform two 16-byte stores. unsigned Alignment = St->getAlignment(); bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && !Subtarget->hasInt256() && + if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && StVT == VT && !IsAligned) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) @@ -21139,9 +24980,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Find the largest store unit MVT StoreType = MVT::i8; - for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE; - tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) { - MVT Tp = (MVT::SimpleValueType)tp; + for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } @@ -21287,7 +25126,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal" +/// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. 
A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, @@ -21413,7 +25252,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { return true; } -/// PerformFADDCombine - Do target-specific dag combines on floating point adds. +/// Do target-specific dag combines on floating point adds. static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); @@ -21428,7 +25267,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformFSUBCombine - Do target-specific dag combines on floating point subs. +/// Do target-specific dag combines on floating point subs. static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { EVT VT = N->getValueType(0); @@ -21443,8 +25282,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and -/// X86ISD::FXOR nodes. +/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x @@ -21458,8 +25296,7 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and -/// X86ISD::FMAX nodes. +/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. 
static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); @@ -21480,7 +25317,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } -/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. +/// Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 // FAND(x, 0.0) -> 0.0 @@ -21493,7 +25330,7 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes +/// Do target-specific dag combines on X86ISD::FANDN nodes static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { // FANDN(x, 0.0) -> 0.0 // FANDN(0.0, x) -> x @@ -21576,13 +25413,29 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // (i8,i32 sext (sdivrem (i8 x, i8 y)) -> + // (i8,i32 (sdivrem_sext_hreg (i8 x, i8 y) + // This exposes the sext to the sdivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). 
+ if (N0.getOpcode() == ISD::SDIVREM && N0.getResNo() == 1 && + N0.getValueType() == MVT::i8 && VT == MVT::i32) { + SDLoc dl(N); + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::SDIVREM8_SEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget->hasFp256()) return SDValue(); - EVT VT = N->getValueType(0); if (VT.isVector() && VT.getSizeInBits() == 256) { SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); if (R.getNode()) @@ -21675,6 +25528,20 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, return R; } + // (i8,i32 zext (udivrem (i8 x, i8 y)) -> + // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) + // This exposes the zext to the udivrem lowering, so that it directly extends + // from AH (which we otherwise need to do contortions to access). + if (N0.getOpcode() == ISD::UDIVREM && + N0.getResNo() == 1 && N0.getValueType() == MVT::i8 && + (VT == MVT::i32 || VT == MVT::i64)) { + SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDValue R = DAG.getNode(X86ISD::UDIVREM8_ZEXT_HREG, dl, NodeTys, + N0.getOperand(0), N0.getOperand(1)); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + return R.getValue(1); + } + return SDValue(); } @@ -21863,14 +25730,14 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); - // Now check that the other operand of the AND is a constant splat. We could + // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. 
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { - // Bail out if the vector isn't a constant splat. - if (!BV->getConstantSplatNode()) + // Bail out if the vector isn't a constant. + if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. @@ -22044,18 +25911,68 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, /// performVZEXTCombine - Performs build vector combines static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget *Subtarget) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + MVT VT = N->getSimpleValueType(0); + SDValue Op = N->getOperand(0); + MVT OpVT = Op.getSimpleValueType(); + MVT OpEltVT = OpVT.getVectorElementType(); + unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements(); + // (vzext (bitcast (vzext (x)) -> (vzext x) - SDValue In = N->getOperand(0); - while (In.getOpcode() == ISD::BITCAST) - In = In.getOperand(0); + SDValue V = Op; + while (V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); - if (In.getOpcode() != X86ISD::VZEXT) - return SDValue(); + if (V != Op && V.getOpcode() == X86ISD::VZEXT) { + MVT InnerVT = V.getSimpleValueType(); + MVT InnerEltVT = InnerVT.getVectorElementType(); - return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0), - In.getOperand(0)); + // If the element sizes match exactly, we can just do one larger vzext. This + // is always an exact type match as vzext operates on integer types. + if (OpEltVT == InnerEltVT) { + assert(OpVT == InnerVT && "Types must match for vzext!"); + return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0)); + } + + // The only other way we can combine them is if only a single element of the + // inner vzext is used in the input to the outer vzext. 
+ if (InnerEltVT.getSizeInBits() < InputBits) + return SDValue(); + + // In this case, the inner vzext is completely dead because we're going to + // only look at bits inside of the low element. Just do the outer vzext on + // a bitcast of the input to the inner. + return DAG.getNode(X86ISD::VZEXT, DL, VT, + DAG.getNode(ISD::BITCAST, DL, OpVT, V)); + } + + // Check if we can bypass extracting and re-inserting an element of an input + // vector. Essentialy: + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && + V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { + SDValue ExtractedV = V.getOperand(0); + SDValue OrigV = ExtractedV.getOperand(0); + if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1))) + if (ExtractIdx->getZExtValue() == 0) { + MVT OrigVT = OrigV.getSimpleValueType(); + // Extract a subvector if necessary... + if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { + int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); + OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), + OrigVT.getVectorNumElements() / Ratio); + OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, + DAG.getIntPtrConstant(0)); + } + Op = DAG.getNode(ISD::BITCAST, DL, OpVT, OrigV); + return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); + } + } + + return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, @@ -22066,7 +25983,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::EXTRACT_VECTOR_ELT: return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: - case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); + case ISD::SELECT: + case X86ISD::SHRUNKBLEND: + return PerformSELECTCombine(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return 
PerformSubCombine(N, DAG, Subtarget); @@ -22079,7 +25998,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); + case ISD::MLOAD: return PerformMLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::MSTORE: return PerformMSTORECombine(N, DAG, Subtarget); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); @@ -22107,12 +26028,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSS: case X86ISD::MOVSD: - case X86ISD::VPERMILP: + case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); @@ -22545,6 +26467,23 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; + case 'L': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || + (Subtarget->is64Bit() && C->getZExtValue() == 0xffffffff)) { + Result = DAG.getTargetConstant(C->getSExtValue(), Op.getValueType()); + break; + } + } + return; + case 'M': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 3) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; case 'N': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 255) { @@ -22553,6 +26492,14 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, } } return; 
+ case 'O': + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + if (C->getZExtValue() <= 127) { + Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); + break; + } + } + return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { @@ -22762,14 +26709,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, Constraint[5] == ')' && Constraint[6] == '}') { - Res.first = X86::ST0+Constraint[4]-'0'; + Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { - Res.first = X86::ST0; + Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } @@ -22897,7 +26844,7 @@ int X86TargetLowering::getScalingFactorCost(const AddrMode &AM, // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. - // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. + // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(AM, Ty)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index c8cdce7c7664..ba077a8a0b46 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86ISELLOWERING_H -#define X86ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H +#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" @@ -187,12 +187,17 @@ namespace llvm { /// PSIGN - Copy integer sign. PSIGN, - /// BLENDV - Blend where the selector is a register. 
- BLENDV, - /// BLENDI - Blend where the selector is an immediate. BLENDI, + /// SHRUNKBLEND - Blend where the condition has been shrunk. + /// This is used to emphasize that the condition mask is + /// no more valid for generic VSELECT optimizations. + SHRUNKBLEND, + + /// ADDSUB - Combined add and sub on an FP vector. + ADDSUB, + // SUBUS - Integer sub with unsigned saturation. SUBUS, @@ -301,6 +306,13 @@ namespace llvm { UMUL, // LOW, HI, FLAGS = umul LHS, RHS + // 8-bit SMUL/UMUL - AX, FLAGS = smul8/umul8 AL, RHS + SMUL8, UMUL8, + + // 8-bit divrem that zero-extend the high result (AH). + UDIVREM8_ZEXT_HREG, + SDIVREM8_SEXT_HREG, + // MUL_IMM - X86 specific multiply by immediate. MUL_IMM, @@ -320,7 +332,10 @@ namespace llvm { // Several flavors of instructions with vector shuffle behaviors. PACKSS, PACKUS, + // Intra-lane alignr PALIGNR, + // AVX512 inter-lane alignr + VALIGN, PSHUFD, PSHUFHW, PSHUFLW, @@ -337,7 +352,8 @@ namespace llvm { MOVSS, UNPCKL, UNPCKH, - VPERMILP, + VPERMILPV, + VPERMILPI, VPERMV, VPERMV3, VPERMIV3, @@ -350,9 +366,9 @@ namespace llvm { VINSERT, VEXTRACT, - // PMULUDQ - Vector multiply packed unsigned doubleword integers + // Vector multiply packed unsigned doubleword integers PMULUDQ, - // PMULUDQ - Vector multiply packed signed doubleword integers + // Vector multiply packed signed doubleword integers PMULDQ, // FMA nodes @@ -363,20 +379,23 @@ namespace llvm { FMADDSUB, FMSUBADD, - // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, - // according to %al. An operator is needed so that this can be expanded - // with control flow. + // Compress and expand + COMPRESS, + EXPAND, + + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. VASTART_SAVE_XMM_REGS, - // WIN_ALLOCA - Windows's _chkstk call to do stack probing. + // Windows's _chkstk call to do stack probing. 
WIN_ALLOCA, - // SEG_ALLOCA - For allocating variable amounts of stack space when using + // For allocating variable amounts of stack space when using // segmented stacks. Check if the current stacklet has enough space, and // falls back to heap allocation if not. SEG_ALLOCA, - // WIN_FTOL - Windows's _ftol2 runtime routine to do fptoui. + // Windows's _ftol2 runtime routine to do fptoui. WIN_FTOL, // Memory barrier @@ -385,38 +404,40 @@ namespace llvm { SFENCE, LFENCE, - // FNSTSW16r - Store FP status word into i16 register. + // Store FP status word into i16 register. FNSTSW16r, - // SAHF - Store contents of %ah into %eflags. + // Store contents of %ah into %eflags. SAHF, - // RDRAND - Get a random integer and indicate whether it is valid in CF. + // Get a random integer and indicate whether it is valid in CF. RDRAND, - // RDSEED - Get a NIST SP800-90B & C compliant random integer and + // Get a NIST SP800-90B & C compliant random integer and // indicate whether it is valid in CF. RDSEED, - // PCMP*STRI PCMPISTRI, PCMPESTRI, - // XTEST - Test if in transactional execution. + // Test if in transactional execution. XTEST, - // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap. + // ERI instructions + RSQRT28, RCP28, EXP2, + + // Compare and swap. LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, LCMPXCHG8_DAG, LCMPXCHG16_DAG, - // VZEXT_LOAD - Load, scalar_to_vector, and zero extend. + // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, - // FNSTCW16m - Store FP control world into i16 memory. + // Store FP control world into i16 memory. FNSTCW16m, - /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the + /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. 
It /// has two inputs (token chain and address) and two outputs (int value @@ -425,7 +446,7 @@ namespace llvm { FP_TO_INT32_IN_MEM, FP_TO_INT64_IN_MEM, - /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the + /// This instruction implements SINT_TO_FP with the /// integer source in memory and FP reg result. This corresponds to the /// X86::FILD*m instructions. It has three inputs (token chain, address, /// and source type) and two outputs (FP value and token chain). FILD_FLAG @@ -433,19 +454,19 @@ namespace llvm { FILD, FILD_FLAG, - /// FLD - This instruction implements an extending load to FP stack slots. + /// This instruction implements an extending load to FP stack slots. /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain /// operand, ptr to load from, and a ValueType node indicating the type /// to load to. FLD, - /// FST - This instruction implements a truncating store to FP stack + /// This instruction implements a truncating store to FP stack /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a /// chain operand, value to store, address, and a ValueType to store it /// as. FST, - /// VAARG_64 - This instruction grabs the address of the next argument + /// This instruction grabs the address of the next argument /// from a va_list. (reads and modifies the va_list in memory) VAARG_64 @@ -457,67 +478,76 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// isVEXTRACT128Index - Return true if the specified + /// Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. bool isVEXTRACT128Index(SDNode *N); - /// isVINSERT128Index - Return true if the specified + /// Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is /// suitable for input to VINSERTF128, VINSERTI128 instructions. 
bool isVINSERT128Index(SDNode *N); - /// isVEXTRACT256Index - Return true if the specified + /// Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. bool isVEXTRACT256Index(SDNode *N); - /// isVINSERT256Index - Return true if the specified + /// Return true if the specified /// INSERT_SUBVECTOR operand specifies a subvector insert that is /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. bool isVINSERT256Index(SDNode *N); - /// getExtractVEXTRACT128Immediate - Return the appropriate + /// Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF128, VEXTRACTI128 instructions. unsigned getExtractVEXTRACT128Immediate(SDNode *N); - /// getInsertVINSERT128Immediate - Return the appropriate + /// Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index /// with VINSERTF128, VINSERT128 instructions. unsigned getInsertVINSERT128Immediate(SDNode *N); - /// getExtractVEXTRACT256Immediate - Return the appropriate + /// Return the appropriate /// immediate to extract the specified EXTRACT_SUBVECTOR index /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. unsigned getExtractVEXTRACT256Immediate(SDNode *N); - /// getInsertVINSERT256Immediate - Return the appropriate + /// Return the appropriate /// immediate to insert at the specified INSERT_SUBVECTOR index /// with VINSERTF64x4, VINSERTI64x4 instructions. unsigned getInsertVINSERT256Immediate(SDNode *N); - /// isZeroNode - Returns true if Elt is a constant zero or a floating point - /// constant +0.0. + /// Returns true if Elt is a constant zero or floating point constant +0.0. bool isZeroNode(SDValue Elt); - /// isOffsetSuitableForCodeModel - Returns true of the given offset can be + /// Returns true of the given offset can be /// fit into displacement field of the instruction. 
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement = true); - /// isCalleePop - Determines whether the callee is required to pop its + /// Determines whether the callee is required to pop its /// own arguments. Callee pop is necessary to support tail calls. bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt); + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; } //===--------------------------------------------------------------------===// - // X86TargetLowering - X86 Implementation of the TargetLowering interface + // X86 Implementation of the TargetLowering interface class X86TargetLowering final : public TargetLowering { public: - explicit X86TargetLowering(X86TargetMachine &TM); + explicit X86TargetLowering(const X86TargetMachine &TM); unsigned getJumpTableEncoding() const override; @@ -528,21 +558,20 @@ namespace llvm { const MachineBasicBlock *MBB, unsigned uid, MCContext &Ctx) const override; - /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC - /// jumptable. + /// Returns relocation base for the given PIC jumptable. SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const override; const MCExpr * getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const override; - /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate + /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contains are placed at 16-byte boundaries while the rest are at /// 4-byte boundaries. 
unsigned getByValTypeAlignment(Type *Ty) const override; - /// getOptimalMemOpType - Returns the target specific optimal type for load + /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it @@ -557,7 +586,7 @@ namespace llvm { bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; - /// isSafeMemOpType - Returns true if it's safe to use load / store of the + /// Returns true if it's safe to use load / store of the /// specified type to expand memcpy / memset inline. This is mostly true /// for all types except for some special cases. For example, on X86 /// targets without SSE2 f64 load / store are done with fldl / fstpl which @@ -565,17 +594,17 @@ namespace llvm { /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; - /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// Returns true if the target allows /// unaligned memory accesses. of the specified type. Returns whether it /// is "fast" by reference in the second argument. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *Fast) const override; - /// LowerOperation - Provide custom lowering hooks for some operations. + /// Provide custom lowering hooks for some operations. /// SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - /// ReplaceNodeResults - Replace the results of node with an illegal result + /// Replace the results of node with an illegal result /// type with new values built out of custom code. 
/// void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, @@ -584,13 +613,13 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// isTypeDesirableForOp - Return true if the target has native support for + /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 /// instruction encodings are longer and some i16 instructions are slow. bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override; - /// isTypeDesirable - Return true if the target has native support for the + /// Return true if the target has native support for the /// specified value type and it is 'desirable' to use the type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer /// and some i16 instructions are slow. @@ -601,24 +630,25 @@ namespace llvm { MachineBasicBlock *MBB) const override; - /// getTargetNodeName - This method returns the name of a target specific - /// DAG node. + /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; - /// getSetCCResultType - Return the value type to use for ISD::SETCC. + bool isCheapToSpeculateCttz() const override; + + bool isCheapToSpeculateCtlz() const override; + + /// Return the value type to use for ISD::SETCC. EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - /// computeKnownBitsForTargetNode - Determine which of the bits specified - /// in Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. + /// Determine which of the bits specified in Mask are known to be either + /// zero or one and return them in the KnownZero/KnownOne bitsets. 
void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; - // ComputeNumSignBitsForTargetNode - Determine the number of bits in the - // operation that are sign bits. + /// Determine the number of bits in the operation that are sign bits. unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG, unsigned Depth) const override; @@ -641,16 +671,15 @@ namespace llvm { const char *LowerXConstraint(EVT ConstraintVT) const override; - /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops - /// vector. If it is invalid, don't add anything to Ops. If hasMemory is - /// true it means one of the asm constraint of the inline asm instruction - /// being processed is 'm'. + /// Lower the specified operand into the Ops vector. If it is invalid, don't + /// add anything to Ops. If hasMemory is true it means one of the asm + /// constraint of the inline asm instruction being processed is 'm'. void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; - /// getRegForInlineAsmConstraint - Given a physical register constraint + /// Given a physical register constraint /// (e.g. {edx}), return the register number and the register class for the /// register. This should only be used for C_Register constraints. On /// error, this returns a register number of 0. @@ -658,17 +687,17 @@ namespace llvm { getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override; - /// isLegalAddressingMode - Return true if the addressing mode represented + /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. 
bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - /// isLegalICmpImmediate - Return true if the specified immediate is legal + /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can /// compare a register against the immediate without having to materialize /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; - /// isLegalAddImmediate - Return true if the specified immediate is legal + /// Return true if the specified immediate is legal /// add immediate, that is the target has add instructions which can /// add a register and the immediate without having to materialize /// the immediate into a register. @@ -683,7 +712,7 @@ namespace llvm { bool isVectorShiftByScalarCheap(Type *Ty) const override; - /// isTruncateFree - Return true if it's free to truncate a value of + /// Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in /// register EAX to i16 by referencing its sub-register AX. bool isTruncateFree(Type *Ty1, Type *Ty2) const override; @@ -691,7 +720,7 @@ namespace llvm { bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; - /// isZExtFree - Return true if any actual instruction that defines a + /// Return true if any actual instruction that defines a /// value of type Ty1 implicit zero-extends the value to Ty2 in the result /// register. This does not necessarily include registers defined in /// unknown ways, such as incoming arguments, or copies from unknown @@ -703,37 +732,35 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. 
fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this + /// method returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - /// isNarrowingProfitable - Return true if it's profitable to narrow + /// Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow /// from i32 to i8 but not from i32 to i16. bool isNarrowingProfitable(EVT VT1, EVT VT2) const override; - /// isFPImmLegal - Returns true if the target can instruction select the + /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// isShuffleMaskLegal - Targets can use this to indicate that they only - /// support *some* VECTOR_SHUFFLE operations, those with specific masks. - /// By default, if a target supports the VECTOR_SHUFFLE node, all mask - /// values are assumed to be legal. + /// Targets can use this to indicate that they only support *some* + /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a + /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to + /// be legal. bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const override; - /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. This is - /// used by Targets can use this to indicate if there is a suitable - /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant - /// pool entry. + /// Similar to isShuffleMaskLegal. 
This is used by Targets can use this to + /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to + /// replace a VAND with a constant pool entry. bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, EVT VT) const override; - /// ShouldShrinkFPConstant - If true, then instruction selection should + /// If true, then instruction selection should /// seek to shrink the FP constant of the specified type to a smaller type /// in order to save space and / or reduce runtime. bool ShouldShrinkFPConstant(EVT VT) const override { @@ -743,23 +770,27 @@ namespace llvm { return !X86ScalarSSEf64 || VT == MVT::f80; } + /// Return true if we believe it is correct and profitable to reduce the + /// load node to a smaller type. + bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, + EVT NewVT) const override; + const X86Subtarget* getSubtarget() const { return Subtarget; } - /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is - /// computed in an SSE register, not on the X87 floating point stack. + /// Return true if the specified scalar FP type is computed in an SSE + /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } - /// isTargetFTOL - Return true if the target uses the MSVC _ftol2 routine - /// for fptoui. + /// Return true if the target uses the MSVC _ftol2 routine for fptoui. bool isTargetFTOL() const; - /// isIntegerTypeFTOL - Return true if the MSVC _ftol2 routine should be - /// used for fptoui to the given type. + /// Return true if the MSVC _ftol2 routine should be used for fptoui to the + /// given type. 
bool isIntegerTypeFTOL(EVT VT) const { return isTargetFTOL() && VT == MVT::i64; } @@ -769,6 +800,10 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + /// Return true if EXTRACT_SUBVECTOR is cheap for this result type + /// with this index. + bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; + /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { return nullptr; // nothing to do, move along. @@ -776,15 +811,14 @@ namespace llvm { unsigned getRegisterByName(const char* RegName, EVT VT) const override; - /// createFastISel - This method returns a target specific FastISel object, + /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; - /// getStackCookieLocation - Return true if the target stores stack - /// protector cookies at a fixed offset in some non-standard address - /// space, and populates the address space and offset as - /// appropriate. + /// Return true if the target stores stack protector cookies at a fixed + /// offset in some non-standard address space, and populates the address + /// space and offset as appropriate. bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const override; @@ -796,6 +830,7 @@ namespace llvm { /// \brief Reset the operation actions based on target options. void resetOperationActions() override; + bool useLoadStackGuardNode() const override; /// \brief Customize the preferred legalization strategy for certain types. 
LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -804,7 +839,7 @@ namespace llvm { findRepresentativeClass(MVT VT) const override; private: - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; const DataLayout *TD; @@ -813,17 +848,16 @@ namespace llvm { /// the operation actions unless we have to. TargetOptions TO; - /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 - /// floating point ops. + /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf32; bool X86ScalarSSEf64; - /// LegalFPImmediates - A list of legal fp immediates. + /// A list of legal FP immediates. std::vector<APFloat> LegalFPImmediates; - /// addLegalFPImmediate - Indicate that this x86 target can instruction + /// Indicate that this x86 target can instruction /// select the specified FP immediate natively. void addLegalFPImmediate(const APFloat& Imm) { LegalFPImmediates.push_back(Imm); @@ -847,9 +881,8 @@ namespace llvm { // Call lowering helpers. - /// IsEligibleForTailCallOptimization - Check whether the call is eligible - /// for tail call optimization. Targets which want to do tail call - /// optimization should implement this function. + /// Check whether the call is eligible for tail call optimization. Targets + /// that want to do tail call optimization should implement this function. 
bool IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, @@ -936,7 +969,7 @@ namespace llvm { bool mayBeEmittedAsTailCall(CallInst *CI) const override; - MVT getTypeForExtArgOrReturn(MVT VT, + EVT getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, @@ -946,6 +979,15 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; + bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + + LoadInst * + lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; + + bool needsCmpXchgNb(const Type *MemType) const; + /// Utility function to emit atomic-load-arith operations (and, or, xor, /// nand, max, min, umax, umin). It takes the corresponding instruction to /// expand, the associated machine basic block, and the associated X86 @@ -975,8 +1017,7 @@ namespace llvm { MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, - MachineBasicBlock *BB, - bool Is64Bit) const; + MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; @@ -1005,6 +1046,15 @@ namespace llvm { /// Convert a comparison if required by the subtarget. SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; + + /// Use rsqrt* to speed up sqrt calculations. + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + + /// Use rcp* to speed up fdiv calculations. 
+ SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; }; namespace X86 { diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index d2894088b80f..c4ec3df7b382 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1,3 +1,273 @@ +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +// The template is also used for scalar types, in this case numelts is 1. +class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, + string suffix = ""> { + RegisterClass RC = rc; + ValueType EltVT = eltvt; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); + + // The GPR register class that can hold the write mask. Use GR8 for fewer + // than 8 elements. Use shift-right and equal to work around the lack of + // !lt in tablegen. + RegisterClass MRC = + !cast<RegisterClass>("GR" # + !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + // VTName is a string name for vector VT. For vector types it will be + // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32 + // It is a little bit complex for scalar types, where NumElts = 1. + // In this case we build v4f32 or v2f64 + string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 32), 4, + !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + + // The vector VT. 
+ ValueType VT = !cast<ValueType>(VTName); + + string EltTypeName = !cast<string>(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. + string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem"); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast<PatFrag>("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + VTName)), VTName)); + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + + // Load patterns used for memory operands. We only have this defined in + // case of i64 element types for sub-512 integer vectors. For now, keep + // MemOpFrag undefined in these cases. + PatFrag MemOpFrag = + !if (!eq (NumElts#EltTypeName, "1f32"), !cast<PatFrag>("memopfsf32"), + !if (!eq (NumElts#EltTypeName, "1f64"), !cast<PatFrag>("memopfsf64"), + !if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName), + !if (!eq (EltTypeName, "i64"), !cast<PatFrag>("memop" # VTName), + !if (!eq (VTName, "v16i32"), !cast<PatFrag>("memop" # VTName), ?))))); + + // The corresponding float type, e.g. v16f32 for v16i32 + // Note: For EltSize < 32, FloatVT is illegal and TableGen + // fails to compile, so we choose FloatVT = VT + ValueType FloatVT = !cast<ValueType>( + !if (!eq (!srl(EltSize,5),0), + VTName, + !if (!eq(TypeVariantName, "i"), + "v" # NumElts # "f" # EltSize, + VTName))); + + // The string to specify embedded broadcast in assembly. 
+ string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. This is only + // defined for NumElts <= 8. + CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast<CD8VForm>("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. + ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. 
+def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; + +class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256, + X86VectorVTInfo i128> { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info, + v16i8x_info>; +def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info, + v8i16x_info>; +def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info, + v4i32x_info>; +def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info, + v2i64x_info>; +def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, + v4f32x_info>; +def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, + v2f64x_info>; + +// This multiclass generates the masking variants from the non-masking +// variant. It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. +multiclass AVX512_maskable_custom<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + list<dag> ZeroMaskingPattern, + string Round = "", + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst "#Round#"|"# + "$dst "#Round#", "#IntelSrcAsm#"}", + Pattern, itin>; + + // Prefer over VMOV*rrk Pat<> + let AddedComplexity = 20 in + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}"#Round#"|"# + "$dst {${mask}}"#Round#", "#IntelSrcAsm#"}", + MaskingPattern, itin>, + EVEX_K { + // In case of the 3src subclass this is overridden with a let. 
+ string Constraints = MaskingConstraint; + } + let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> + def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}"#Round#"|"# + "$dst {${mask}} {z}"#Round#", "#IntelSrcAsm#"}", + ZeroMaskingPattern, + itin>, + EVEX_KZ; +} + + +// Common base class of AVX512_maskable and AVX512_maskable_3src. +multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + SDNode Select = vselect, string Round = "", + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, MaskingRHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + Round, MaskingConstraint, NoItinerary, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect, + Round, "$src0 = $dst", itin, IsCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. 
+multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, string Round = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select, + Round, "$src0 = $dst", itin, IsCommutable>; + +// Similar to AVX512_maskable but in this case one of the source operands +// ($src1) is already tied to $dst so we just use that for the preserved +// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude +// $src1. +multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; + + +multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], "", + "$src0 = $dst">; + // Bitcasts between 512-bit vector types. 
Return the original type since // no instruction is needed for the conversion let Predicates = [HasAVX512] in { @@ -17,6 +287,7 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; + def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; @@ -116,119 +387,92 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // -// -- 32x8 form -- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, f128mem:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; -} - -// -- 64x4 fp form -- -let hasSideEffects = 0, ExeDomain = SSEPackedDouble in { -def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, 
CD8VT4>; + +multiclass vinsert_for_size_no_alt<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), + (ins VR512:$src1, From.RC:$src2, i8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts # + "\t{$src3, $src2, $src1, $dst|" + "$dst, $src1, $src2, $src3}", + [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1), + (From.VT From.RC:$src2), + (iPTR imm)))]>, + EVEX_4V, EVEX_V512; + + let mayLoad = 1 in + def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), + (ins VR512:$src1, From.MemOp:$src2, i8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts # + "\t{$src3, $src2, $src1, $dst|" + "$dst, $src1, $src2, $src3}", + []>, + EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } } -// -- 32x4 integer form -- -let hasSideEffects = 0 in { -def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i128mem:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +multiclass vinsert_for_size<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, + PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm> : + vinsert_for_size_no_alt<Opcode, From, To, + vinsert_insert, INSERT_get_vinsert_imm> { + // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for + // vinserti32x4. Only add this if 64x2 and friends are not supported + // natively via AVX512DQ. 
+ let Predicates = [NoDQI] in + def : Pat<(vinsert_insert:$ins + (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), + (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr") + VR512:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm VR512:$ins)))>; } -let hasSideEffects = 0 in { -// -- 64x4 form -- -def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), - (bc_v4i32 (loadv2i64 addr:$src2)), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), - (iPTR 
imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), - (bc_v8i32 (loadv4i64 addr:$src2)), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; +multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 2, EltVT64, 
VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>; + let Predicates = [HasDQI] in + defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>, VEX_W; + defm NAME # "64x4" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>, VEX_W; + let Predicates = [HasDQI] in + defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>; +} + +defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; +defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + (ins VR128X:$src1, VR128X:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128X:$src1, f32mem:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), @@ -237,106 +481,90 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), //===----------------------------------------------------------------------===// // AVX-512 VECTOR EXTRACT //--- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -// -- 32x4 form -- -def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - 
"vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), - (ins f128mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), - (ins f256mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; + +multiclass vextract_for_size<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, + PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), + (ins VR512:$src1, i8imm:$idx), + "vextract" # To.EltTypeName # "x4", + "$idx, $src1", "$src1, $idx", + [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), + (iPTR imm)))]>, + AVX512AIi8Base, EVEX, EVEX_V512; + let mayStore = 1 in + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, VR512:$src1, i8imm:$src2), + "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" + "$dst, $src1, $src2}", + []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; + } + + // Codegen pattern with the alternative types, e.g. 
v8i64 -> v2i64 for + // vextracti32x4 + def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), + (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr") + VR512:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + + // A 128/256-bit subvector extract from the first 512-bit vector position is + // a subregister copy that needs no instruction. + def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), + (To.VT + (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; + + // And for the alternative types. + def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), + (AltTo.VT + (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + + // Intrinsic call with masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0, + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call with zero-masking. + def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x4rrkz") + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call without masking. 
+ def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x4rr") + VR512:$src1, imm:$idx)>; } -let hasSideEffects = 0 in { -// -- 32x4 form -- -def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), - (ins i128mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), - (ins i256mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v4f32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), - (v4i32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v2f64 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v2i64 (VEXTRACTI32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - - -def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v8f32 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v16i32 
VR512:$src1), (iPTR imm)), - (v8i32 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v4f64 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v4i64 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -// A 256-bit subvector extract from the first 512-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; -def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; -def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; -def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; - -// zmm -> xmm -def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +multiclass vextract_for_type<ValueType EltVT32, int Opcode32, + ValueType EltVT64, int Opcode64> { + defm NAME # "32x4" : vextract_for_size<Opcode32, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract, + EXTRACT_get_vextract128_imm>; + defm NAME # "64x4" : 
vextract_for_size<Opcode64, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + vextract256_extract, + EXTRACT_get_vextract256_imm>, VEX_W; +} +defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; +defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. @@ -368,13 +596,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), - (ins VR128X:$src1, u32u8imm:$src2), + (ins VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX; def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), - (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; @@ -382,105 +610,175 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass DestRC, - RegisterClass SrcRC, X86MemOperand x86memop> { - def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX; - def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),[]>, EVEX; +multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, + ValueType svt, X86VectorVTInfo _> { + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs 
_.RC:$dst), + (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix), + "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>, + T8PD, EVEX; + + let mayLoad = 1 in { + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src", + (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>, + T8PD, EVEX; + } +} + +multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>, + EVEX_V256; + } } + let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSSZ : avx512_fp_broadcast<0x18, "vbroadcastss", VR512, - VR128X, f32mem>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, + avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + let Predicates = [HasVLX] in { + defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, + v4f32, v4f32x_info>, EVEX_V128, + EVEX_CD8<32, CD8VT1>; + } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSDZ : avx512_fp_broadcast<0x19, "vbroadcastsd", VR512, - VR128X, f64mem>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, + avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +// avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. +// Later, we can canonize broadcast instructions before ISel phase and +// eliminate additional patterns on ISel. 
+// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar +// representations of source +multiclass avx512_broadcast_pat<string InstName, SDNode OpNode, + X86VectorVTInfo _, RegisterClass SrcRC_v, + RegisterClass SrcRC_s> { + def : Pat<(_.VT (OpNode (_.EltVT SrcRC_s:$src))), + (!cast<Instruction>(InstName##"r") + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + let AddedComplexity = 30 in { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.RC:$src0)), + (!cast<Instruction>(InstName##"rk") _.RC:$src0, _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + + def : Pat<(_.VT(vselect _.KRCWM:$mask, + (OpNode (_.EltVT SrcRC_s:$src)), _.ImmAllZerosV)), + (!cast<Instruction>(InstName##"rkz") _.KRCWM:$mask, + (COPY_TO_REGCLASS SrcRC_s:$src, SrcRC_v))>; + } +} + +defm : avx512_broadcast_pat<"VBROADCASTSSZ", X86VBroadcast, v16f32_info, + VR128X, FR32X>; +defm : avx512_broadcast_pat<"VBROADCASTSDZ", X86VBroadcast, v8f64_info, + VR128X, FR64X>; + +let Predicates = [HasVLX] in { + defm : avx512_broadcast_pat<"VBROADCASTSSZ256", X86VBroadcast, + v8f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSSZ128", X86VBroadcast, + v4f32x_info, VR128X, FR32X>; + defm : avx512_broadcast_pat<"VBROADCASTSDZ256", X86VBroadcast, + v4f64x_info, VR128X, FR64X>; } def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDZrm addr:$src)>; + (VBROADCASTSDZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), - (VBROADCASTSSZrm addr:$src)>; + (VBROADCASTSSZm addr:$src)>; def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), - (VBROADCASTSDZrm addr:$src)>; - -multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr, - RegisterClass SrcRC, RegisterClass KRC> { - def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, 
$src}"), - []>, EVEX, EVEX_V512; - def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), - (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), - []>, EVEX, EVEX_V512, EVEX_KZ; -} - -defm VPBROADCASTDr : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>; -defm VPBROADCASTQr : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>, - VEX_W; - + (VBROADCASTSDZm addr:$src)>; + +multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _, + RegisterClass SrcRC> { + defm r : AVX512_maskable_in_asm<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), "vpbroadcast"##_.Suffix, + "$src", "$src", []>, T8PD, EVEX; +} + +multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR32, + HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR32, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32, + HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64, + HasAVX512>, VEX_W; + def : Pat <(v16i32 (X86vzext VK16WM:$mask)), - (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>; + (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>; def : Pat <(v8i64 (X86vzext VK8WM:$mask)), - (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>; + (VPBROADCASTQrZrkz VK8WM:$mask, (i64 (MOV64ri 0x1)))>; def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v16i32 (X86VBroadcastm VK16WM:$mask, (i32 GR32:$src))), 
- (VPBROADCASTDrZkrr VK16WM:$mask, GR32:$src)>; + (VPBROADCASTDrZrkz VK16WM:$mask, GR32:$src)>; def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))), - (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>; + (VPBROADCASTQrZrkz VK8WM:$mask, GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))), - (VPBROADCASTDrZrr GR32:$src)>; + (VPBROADCASTDrZr GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))), - (VPBROADCASTQrZrr GR64:$src)>; + (VPBROADCASTQrZr GR64:$src)>; def : Pat<(v16i32 (int_x86_avx512_mask_pbroadcast_d_gpr_512 (i32 GR32:$src), (v16i32 immAllZerosV), (i16 GR16:$mask))), - (VPBROADCASTDrZkrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; + (VPBROADCASTDrZrkz (COPY_TO_REGCLASS GR16:$mask, VK16WM), GR32:$src)>; def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), - (VPBROADCASTQrZkrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; + (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, RegisterClass KRC> { def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, VR128X:$src), - !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : 
AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, (ld_frag addr:$src))))]>, EVEX, EVEX_KZ; } } @@ -497,12 +795,12 @@ multiclass avx512_int_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass KRC> { let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def krm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; } } @@ -519,27 +817,32 @@ def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), (VPBROADCASTQZrr VR128X:$src)>; -def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; -def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; + +def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), + (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 
VR512:$src), sub_xmm))>; +def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), + (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZrr VR128X:$src)>; + (VBROADCASTSSZr VR128X:$src)>; def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZrr VR128X:$src)>; - + (VBROADCASTSDZr VR128X:$src)>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), - (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X))>; def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), - (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; + (VBROADCASTSDZr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; let Predicates = [HasAVX512] in { def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), - (EXTRACT_SUBREG + (EXTRACT_SUBREG (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), addr:$src)), sub_ymm)>; } @@ -548,64 +851,107 @@ def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), //--- multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass DstRC, RegisterClass KRC, - ValueType OpVT, ValueType SrcVT> { -def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX; + RegisterClass KRC> { +let Predicates = [HasCDI] in +def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V512; + +let Predicates = [HasCDI, HasVLX] in { +def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V128; +def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs 
VR256:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + []>, EVEX, EVEX_V256; +} } let Predicates = [HasCDI] in { -defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, - VK16, v16i32, v16i1>, EVEX_V512; -defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, - VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + VK8>, VEX_W; } //===----------------------------------------------------------------------===// // AVX-512 - VPERM // // -- immediate form -- -multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT> { - def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), +multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; - def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.MemOp:$src1, i8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (OpVT (OpNode (mem_frag addr:$src1), - (i8 imm:$src2))))]>, EVEX; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (OpNode (_.MemOpFrag addr:$src1), + (i8 imm:$src2))))]>, + EVEX, EVEX_CD8<_.EltSize, CD8VF>; +} } -defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, - i512mem, v8i64>, 
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, - f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> : + avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat("vpermil" # _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (X86VPermilpv _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2))))]>, + EVEX_4V; + def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), + !strconcat("vpermil" # _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (_.VT (X86VPermilpv _.RC:$src1, + (Ctrl.VT (Ctrl.MemOpFrag addr:$src2)))))]>, + EVEX_4V; + } +} + +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>, + EVEX_V512, VEX_W; +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>, + EVEX_V512, VEX_W; + +defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, + EVEX_V512; +defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, + EVEX_V512, VEX_W; + +def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPSZri VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM - register form -- -multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, +multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set 
RC:$dst, (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V; def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>, EVEX_4V; @@ -613,13 +959,13 @@ multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, defm VPERMDZ : avx512_perm<0x36, "vpermd", VR512, memopv16i32, i512mem, v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, +defm VPERMQZ : avx512_perm<0x36, "vpermq", VR512, memopv8i64, i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; let ExeDomain = SSEPackedSingle in defm VPERMPSZ : avx512_perm<0x16, "vpermps", VR512, memopv16f32, f512mem, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, +defm VPERMPDZ : avx512_perm<0x16, "vpermpd", VR512, memopv8f64, f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // -- VPERM2I - 3 source operands form -- @@ -630,7 +976,7 @@ let Constraints = "$src1 = $dst" in { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, EVEX_4V; @@ -638,7 +984,7 @@ let Constraints = "$src1 = $dst" in { def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}}|" + "\t{$src3, $src2, $dst {${mask}}|" "$dst {${mask}}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, (OpNode RC:$src1, RC:$src2, @@ -650,7 +996,7 @@ let Constraints = "$src1 = $dst" in { def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, 
RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}} {z} |", + "\t{$src3, $src2, $dst {${mask}} {z} |", "$dst {${mask}} {z}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, (OpNode RC:$src1, RC:$src2, @@ -662,7 +1008,7 @@ let Constraints = "$src1 = $dst" in { def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, (mem_frag addr:$src3))))]>, EVEX_4V; @@ -670,7 +1016,7 @@ let Constraints = "$src1 = $dst" in { def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}}|" + "\t{$src3, $src2, $dst {${mask}}|" "$dst {${mask}}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, @@ -683,7 +1029,7 @@ let Constraints = "$src1 = $dst" in { def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst {${mask}} {z}|" + "\t{$src3, $src2, $dst {${mask}} {z}|" "$dst {${mask}} {z}, $src2, $src3}"), [(set RC:$dst, (OpVT (vselect KRC:$mask, @@ -739,77 +1085,110 @@ defm VPERMT2PD : avx512_perm_table_3src<0x7F, "pd", VR512, memopv8f64, i512mem, //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // -multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, - RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, PatFrag mem_frag, - SDNode OpNode, ValueType vt> { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, RC:$src2), +multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, 
${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2), - (vt RC:$src1)))]>, EVEX_4V, EVEX_K; - let mayLoad = 1 in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V; + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT _.RC:$src2)))]>, EVEX_4V, EVEX_K; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ; + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K; + "\t{$src2, $src1, ${dst} |${dst}, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + } + } } +multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + + def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, 
_.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.RC:$dst,(X86select _.KRCWM:$mask, (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + + def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + +} + +multiclass blendmask_dq <bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>, + avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + +multiclass blendmask_bw <bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + } +} + + +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; 
-let ExeDomain = SSEPackedSingle in -defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", - VK16WM, VR512, f512mem, - memopv16f32, vselect, v16f32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; -let ExeDomain = SSEPackedDouble in -defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", - VK8WM, VR512, f512mem, - memopv8f64, vselect, v8f64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16f32 (int_x86_avx512_mask_blend_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), (i16 GR16:$mask))), - (VBLENDMPSZrr (COPY_TO_REGCLASS GR16:$mask, VK16WM), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8f64 (int_x86_avx512_mask_blend_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), (i8 GR8:$mask))), - (VBLENDMPDZrr (COPY_TO_REGCLASS GR8:$mask, VK8WM), - VR512:$src1, VR512:$src2)>; - -defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", - VK16WM, VR512, f512mem, - memopv16i32, vselect, v16i32>, - EVEX_CD8<32, CD8VF>, EVEX_V512; - -defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", - VK8WM, VR512, f512mem, - memopv8i64, vselect, v8i64>, - VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512; - -def : Pat<(v16i32 (int_x86_avx512_mask_blend_d_512 (v16i32 VR512:$src1), - (v16i32 VR512:$src2), (i16 GR16:$mask))), - (VPBLENDMDZrr (COPY_TO_REGCLASS GR16:$mask, VK16), - VR512:$src1, VR512:$src2)>; - -def : Pat<(v8i64 (int_x86_avx512_mask_blend_q_512 (v8i64 VR512:$src1), - (v8i64 VR512:$src2), (i8 GR8:$mask))), - (VPBLENDMQZrr (COPY_TO_REGCLASS GR8:$mask, VK8), - VR512:$src1, VR512:$src2)>; let Predicates = [HasAVX512] in { def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1), (v8f32 VR256X:$src2))), - (EXTRACT_SUBREG - (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - 
(EXTRACT_SUBREG - (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), + (EXTRACT_SUBREG + (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; } @@ -850,98 +1229,295 @@ defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, AVXCC, X86cmpms, f64, loadf64, XD, VEX_W; } -multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt> { +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { def rr : AVX512BI<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in def rm : AVX512BI<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))], + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrk : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))], + IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmk : AVX512BI<opc, 
MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; } -defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, - memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, - memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in { + def rmb : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", + "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmbk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} -defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, - memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, - memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + 
AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS 
(VPCMPGTDZrr + (COPY_TO_REGCLASS (VPCMPGTDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr + (COPY_TO_REGCLASS (VPCMPEQDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc<bits<8> opc, RegisterClass WMRC, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string Suffix> { +multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> { def rri : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc))], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + let mayLoad = 1 in def rmi : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), !strconcat("vpcmp${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2), - imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + imm:$cc)))], + 
IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + def rmi_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rrik_alt : AVX512AIi8<opc, MRMSrcReg, - (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, RC:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + i8imm:$cc), !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; - def rmi_alt : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), - !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; def rmik_alt : AVX512AIi8<opc, MRMSrcMem, - (outs KRC:$dst), (ins WMRC:$mask, RC:$src1, x86memop:$src2, i8imm:$cc), + (outs _.KRC:$dst), (ins 
_.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + i8imm:$cc), !strconcat("vpcmp", Suffix, - "\t{$cc, $src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2, $cc}"), + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, "d">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, "ud">, - EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_cc<opc, Suffix, OpNode, _> { + let mayLoad = 1 in { + def rmib : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + imm:$cc)))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } + + // Accept explicit immediate argument form instead of comparison code. 
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rmib_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, i8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), + [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128; + } +} -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, "q">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, "uq">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB 
: avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; // avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, @@ -950,17 +1526,17 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, def rri : AVX512PIi8<0xC2, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (X86cmpm (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>; def rrib: AVX512PIi8<0xC2, MRMSrcReg, (outs KRC:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), + "\t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), [], d>, EVEX_B; def rmi : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), !strconcat("vcmp${cc}", suffix, - " \t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [(set KRC:$dst, (X86cmpm (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>; @@ -969,11 +1545,11 @@ multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC, def rri_alt : AVX512PIi8<0xC2, MRMSrcReg, (outs 
KRC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc), !strconcat("vcmp", suffix, - " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem, (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc), !strconcat("vcmp", suffix, - " \t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; + "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), [], d>; } } @@ -1001,25 +1577,25 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), imm:$cc), VK8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPSZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; - + def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), FROUND_NO_EXC)), (COPY_TO_REGCLASS (VCMPPDZrrib VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; def : Pat<(i16 (int_x86_avx512_mask_cmp_ps_512 (v16f32 VR512:$src1), - (v16f32 VR512:$src2), imm:$cc, (i16 -1), + (v16f32 VR512:$src2), i32immZExt5:$cc, (i16 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPSZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR16)>; def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), - (v8f64 VR512:$src2), imm:$cc, (i8 -1), + (v8f64 VR512:$src2), i32immZExt5:$cc, (i8 -1), FROUND_CURRENT)), (COPY_TO_REGCLASS (VCMPPDZrri VR512:$src1, VR512:$src2, (I8Imm imm:$cc)), GR8)>; @@ -1031,17 +1607,17 @@ def : Pat<(i8 (int_x86_avx512_mask_cmp_pd_512 (v8f64 VR512:$src1), // multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, string OpcodeStr, RegisterClass KRC, - ValueType vt, X86MemOperand x86memop> { + ValueType vvt, ValueType ivt, X86MemOperand x86memop> { let hasSideEffects = 0 in { def kk : I<opc_kk, 
MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; let mayLoad = 1 in def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set KRC:$dst, (vt (load addr:$src)))]>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vvt (bitconvert (ivt (load addr:$src)))))]>; let mayStore = 1 in def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } @@ -1050,38 +1626,88 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, RegisterClass KRC, RegisterClass GRC> { let hasSideEffects = 0 in { def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; } } -let Predicates = [HasAVX512] in { - defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, - VEX, PS; - defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, +let Predicates = [HasDQI] in + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8, + i8mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, + VEX, PD; + +let Predicates = [HasAVX512] in + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16, + i16mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, VEX, PS; + +let Predicates = [HasBWI] in { + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1, i32, + i32mem>, VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, + VEX, XD; +} + +let 
Predicates = [HasBWI] in { + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64, + i64mem>, VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, + VEX, XD, VEX_W; } +// GR from/to mask register +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (KMOVBkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit))>; + def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG (KMOVBrk VK8:$src), sub_8bit)>; +} let Predicates = [HasAVX512] in { - // GR16 from/to 16-bit mask def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>; def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), (KMOVDkr GR32:$src)>; + def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), (KMOVDrk VK32:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), (KMOVQkr GR64:$src)>; + def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), (KMOVQrk VK64:$src)>; +} - // Store kreg in memory - def : Pat<(store (v16i1 VK16:$src), addr:$dst), +// Load/store kreg +let Predicates = [HasDQI] in { + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), + (KMOVBmk addr:$dst, VK8:$src)>; +} +let Predicates = [HasAVX512] in { + def : Pat<(store (i16 (bitconvert (v16i1 VK16:$src))), addr:$dst), (KMOVWmk addr:$dst, VK16:$src)>; - - def : Pat<(store VK8:$src, addr:$dst), + def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK8:$src, VK16))>; - def : Pat<(i1 (load addr:$src)), (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK1)>; - - def : Pat<(v8i1 (load addr:$src)), + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (COPY_TO_REGCLASS (KMOVWkm addr:$src), VK8)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i32 (bitconvert (v32i1 VK32:$src))), addr:$dst), + (KMOVDmk addr:$dst, 
VK32:$src)>; +} +let Predicates = [HasBWI] in { + def : Pat<(store (i64 (bitconvert (v64i1 VK64:$src))), addr:$dst), + (KMOVQmk addr:$dst, VK64:$src)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(i1 (trunc (i64 GR64:$src))), + (COPY_TO_REGCLASS (KMOVWkr (AND32ri (EXTRACT_SUBREG $src, sub_32bit), + (i32 1))), VK1)>; def : Pat<(i1 (trunc (i32 GR32:$src))), (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>; @@ -1094,7 +1720,7 @@ let Predicates = [HasAVX512] in { (COPY_TO_REGCLASS (KMOVWkr (AND32ri (SUBREG_TO_REG (i32 0), $src, sub_16bit), (i32 1))), VK1)>; - + def : Pat<(i32 (zext VK1:$src)), (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i8 (zext VK1:$src)), @@ -1113,6 +1739,14 @@ let Predicates = [HasAVX512] in { def : Pat<(v8i1 (scalar_to_vector VK1:$src)), (COPY_TO_REGCLASS VK1:$src, VK8)>; } +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; + def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; +} + + // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
let Predicates = [HasAVX512] in { // GR from/to 8-bit mask without native support @@ -1129,26 +1763,38 @@ let Predicates = [HasAVX512] in { (COPY_TO_REGCLASS VK16:$src, VK1)>; def : Pat<(i1 (X86Vextract VK8:$src, (iPTR 0))), (COPY_TO_REGCLASS VK8:$src, VK1)>; - +} +let Predicates = [HasBWI] in { + def : Pat<(i1 (X86Vextract VK32:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK32:$src, VK1)>; + def : Pat<(i1 (X86Vextract VK64:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK64:$src, VK1)>; } // Mask unary operation // - KNOT multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, - RegisterClass KRC, SDPatternOperator OpNode> { - let Predicates = [HasAVX512] in + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd> { + let Predicates = [prd] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set KRC:$dst, (OpNode KRC:$src))]>; } -multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode> { - defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, PS; +multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI>, VEX, PD; + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + HasAVX512>, VEX, PS; + defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI>, VEX, PD, VEX_W; + defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI>, VEX, PS, VEX_W; } -defm KNOT : avx512_mask_unop_w<0x44, "knot", not>; +defm KNOT : avx512_mask_unop_all<0x44, "knot", not>; multiclass avx512_mask_unop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -1159,43 +1805,60 @@ multiclass avx512_mask_unop_int<string IntName, string InstName> { } defm : avx512_mask_unop_int<"knot", "KNOT">; +let Predicates = [HasDQI] 
in +def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (KNOTBrr VK8:$src1)>; +let Predicates = [HasAVX512] in def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK32:$src1, (v32i1 immAllOnesV)), (KNOTDrr VK32:$src1)>; +let Predicates = [HasBWI] in +def : Pat<(xor VK64:$src1, (v64i1 immAllOnesV)), (KNOTQrr VK64:$src1)>; + +// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit +let Predicates = [HasAVX512] in { def : Pat<(xor VK8:$src1, (v8i1 immAllOnesV)), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>; -// With AVX-512, 8-bit mask is promoted to 16-bit mask. def : Pat<(not VK8:$src), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; +} // Mask binary operation // - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, - RegisterClass KRC, SDPatternOperator OpNode> { - let Predicates = [HasAVX512] in + RegisterClass KRC, SDPatternOperator OpNode, + Predicate prd> { + let Predicates = [prd] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; } -multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode> { - defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX_4V, VEX_L, PS; +multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode> { + defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + HasDQI>, VEX_4V, VEX_L, PD; + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + HasAVX512>, VEX_4V, VEX_L, PS; + defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + HasBWI>, VEX_4V, VEX_L, VEX_W, PD; + defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + HasBWI>, 
VEX_4V, VEX_L, VEX_W, PS; } def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; let isCommutable = 1 in { - defm KAND : avx512_mask_binop_w<0x41, "kand", and>; - let isCommutable = 0 in - defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>; - defm KOR : avx512_mask_binop_w<0x45, "kor", or>; - defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>; - defm KXOR : avx512_mask_binop_w<0x47, "kxor", xor>; + defm KAND : avx512_mask_binop_all<0x41, "kand", and>; + defm KOR : avx512_mask_binop_all<0x45, "kor", or>; + defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor>; + defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor>; } +let isCommutable = 0 in + defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn>; def : Pat<(xor VK1:$src1, VK1:$src2), (COPY_TO_REGCLASS (KXORWrr (COPY_TO_REGCLASS VK1:$src1, VK16), @@ -1245,7 +1908,7 @@ multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX512] in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { @@ -1274,7 +1937,7 @@ multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDNode OpNode> { let Predicates = [HasAVX512], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1|$src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; } @@ -1295,7 +1958,7 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, let Predicates = [HasAVX512] in def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm), !strconcat(OpcodeStr, - " \t{$imm, $src, $dst|$dst, $src, $imm}"), + "\t{$imm, $src, 
$dst|$dst, $src, $imm}"), [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; } @@ -1341,6 +2004,17 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +let Predicates = [HasVLX] in { + def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; +} + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; @@ -1350,104 +2024,176 @@ def : Pat<(v8i1 (X86vsrli VK8:$src, (i8 imm:$imm))), // AVX-512 - Aligned and unaligned load and store // -multiclass avx512_load<bits<8> opc, RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, PatFrag ld_frag, - string asm, Domain d, - ValueType vt, bit IsReMaterializable = 1> { +multiclass avx512_load<bits<8> opc, string OpcodeStr, PatFrag ld_frag, + RegisterClass KRC, RegisterClass RC, + ValueType vt, ValueType zvt, X86MemOperand memop, + Domain d, bit IsReMaterializable = 1> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [], d>, - EVEX; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + d>, EVEX; def rrkz : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; } - 
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in - def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), - [(set (vt RC:$dst), (ld_frag addr:$src))], d>, EVEX; - let Constraints = "$src1 = $dst", hasSideEffects = 0 in { - def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, - EVEX, EVEX_K; - let mayLoad = 1 in - def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86memop:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - [], d>, EVEX, EVEX_K; + let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable, + SchedRW = [WriteLoad] in + def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (bitconvert (ld_frag addr:$src))))], + d>, EVEX; + + let AddedComplexity = 20 in { + let Constraints = "$src0 = $dst", hasSideEffects = 0 in { + let hasSideEffects = 0 in + def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src0, KRC:$mask, RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set RC:$dst, (vt (vselect KRC:$mask, + (vt RC:$src1), + (vt RC:$src0))))], + d>, EVEX, EVEX_K; + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src0, KRC:$mask, memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set RC:$dst, (vt + (vselect KRC:$mask, + (vt (bitconvert (ld_frag addr:$src1))), + (vt RC:$src0))))], + d>, EVEX, EVEX_K; + } + let mayLoad = 1, SchedRW = [WriteLoad] in + def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), + (ins KRC:$mask, memop:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set RC:$dst, (vt 
+ (vselect KRC:$mask, + (vt (bitconvert (ld_frag addr:$src))), + (vt (bitconvert (zvt immAllZerosV))))))], + d>, EVEX, EVEX_KZ; + } +} + +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, string ld_pat, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, bit IsReMaterializable = 1> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, + !cast<PatFrag>(ld_pat##"v"##vsz512##elty##elsz), + !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, + !cast<ValueType>("v"##vsz512##elty##elsz), v16i32, + !cast<X86MemOperand>(elty##"512mem"), d, + IsReMaterializable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, + !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), + "v"##vsz256##elty##elsz, "v4i64")), + !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, + !cast<ValueType>("v"##vsz256##elty##elsz), v8i32, + !cast<X86MemOperand>(elty##"256mem"), d, + IsReMaterializable>, EVEX_V256; + + defm Z128 : avx512_load<opc, OpcodeStr, + !cast<PatFrag>(ld_pat##!if(!eq(elty,"f"), + "v"##vsz128##elty##elsz, "v2i64")), + !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, + !cast<ValueType>("v"##vsz128##elty##elsz), v4i32, + !cast<X86MemOperand>(elty##"128mem"), d, + IsReMaterializable>, EVEX_V128; } - let mayLoad = 1 in - def rmkz : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src2}"), - [], d>, EVEX, EVEX_KZ; } -multiclass avx512_store<bits<8> opc, RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, PatFrag store_frag, - string asm, Domain d, ValueType vt> { + +multiclass avx512_store<bits<8> opc, string OpcodeStr, PatFrag st_frag, + ValueType OpVT, RegisterClass KRC, RegisterClass RC, + X86MemOperand memop, Domain d> { let isAsmParserOnly = 1, hasSideEffects = 0 in { def rr_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins RC:$src), - !strconcat(asm, " \t{$src, 
$dst|$dst, $src}"), [], d>, + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], d>, EVEX; let Constraints = "$src1 = $dst" in - def alt_rrk : AVX512PI<opc, MRMDestReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(asm, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, + def rrk_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), + (ins RC:$src1, KRC:$mask, RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>, EVEX, EVEX_K; - def alt_rrkz : AVX512PI<opc, MRMDestReg, (outs RC:$dst), + def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), [], d>, EVEX, EVEX_KZ; } let mayStore = 1 in { - def mr : AVX512PI<opc, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), - [(store_frag (vt RC:$src), addr:$dst)], d>, EVEX; + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (OpVT RC:$src), addr:$dst)], d>, EVEX; def mrk : AVX512PI<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + (ins memop:$dst, KRC:$mask, RC:$src), + !strconcat(OpcodeStr, + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), [], d>, EVEX, EVEX_K; - def mrkz : AVX512PI<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, RC:$src), - !strconcat(asm, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [], d>, EVEX, EVEX_KZ; } } -defm VMOVAPSZ : avx512_load<0x28, VR512, VK16WM, f512mem, alignedloadv16f32, - "vmovaps", SSEPackedSingle, v16f32>, - avx512_store<0x29, VR512, VK16WM, f512mem, alignedstore512, - "vmovaps", SSEPackedSingle, v16f32>, - PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm 
VMOVAPDZ : avx512_load<0x28, VR512, VK8WM, f512mem, alignedloadv8f64, - "vmovapd", SSEPackedDouble, v8f64>, - avx512_store<0x29, VR512, VK8WM, f512mem, alignedstore512, - "vmovapd", SSEPackedDouble, v8f64>, - PD, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VMOVUPSZ : avx512_load<0x10, VR512, VK16WM, f512mem, loadv16f32, - "vmovups", SSEPackedSingle, v16f32>, - avx512_store<0x11, VR512, VK16WM, f512mem, store, - "vmovups", SSEPackedSingle, v16f32>, - PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVUPDZ : avx512_load<0x10, VR512, VK8WM, f512mem, loadv8f64, - "vmovupd", SSEPackedDouble, v8f64, 0>, - avx512_store<0x11, VR512, VK8WM, f512mem, store, - "vmovupd", SSEPackedDouble, v8f64>, - PD, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + +multiclass avx512_store_vl<bits<8> opc, string OpcodeStr, string st_pat, + string st_suff_512, string st_suff_256, + string st_suff_128, string elty, string elsz, + string vsz512, string vsz256, string vsz128, + Domain d, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_512), + !cast<ValueType>("v"##vsz512##elty##elsz), + !cast<RegisterClass>("VK"##vsz512##"WM"), VR512, + !cast<X86MemOperand>(elty##"512mem"), d>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_256), + !cast<ValueType>("v"##vsz256##elty##elsz), + !cast<RegisterClass>("VK"##vsz256##"WM"), VR256X, + !cast<X86MemOperand>(elty##"256mem"), d>, EVEX_V256; + + defm Z128 : avx512_store<opc, OpcodeStr, !cast<PatFrag>(st_pat##st_suff_128), + !cast<ValueType>("v"##vsz128##elty##elsz), + !cast<RegisterClass>("VK"##vsz128##"WM"), VR128X, + !cast<X86MemOperand>(elty##"128mem"), d>, EVEX_V128; + } +} + +defm VMOVAPS : avx512_load_vl<0x28, "vmovaps", "alignedload", "f", "32", + "16", "8", "4", SSEPackedSingle, HasAVX512>, + avx512_store_vl<0x29, "vmovaps", "alignedstore", + "512", "256", "", "f", "32", "16", "8", "4", + SSEPackedSingle, 
HasAVX512>, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_load_vl<0x28, "vmovapd", "alignedload", "f", "64", + "8", "4", "2", SSEPackedDouble, HasAVX512>, + avx512_store_vl<0x29, "vmovapd", "alignedstore", + "512", "256", "", "f", "64", "8", "4", "2", + SSEPackedDouble, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", "load", "f", "32", + "16", "8", "4", SSEPackedSingle, HasAVX512>, + avx512_store_vl<0x11, "vmovups", "store", "", "", "", "f", "32", + "16", "8", "4", SSEPackedSingle, HasAVX512>, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", "load", "f", "64", + "8", "4", "2", SSEPackedDouble, HasAVX512, 0>, + avx512_store_vl<0x11, "vmovupd", "store", "", "", "", "f", "64", + "8", "4", "2", SSEPackedDouble, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr, - (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), + (bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)), (VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr, @@ -1463,75 +2209,160 @@ def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -defm VMOVDQA32: avx512_load<0x6F, VR512, VK16WM, i512mem, alignedloadv16i32, - "vmovdqa32", SSEPackedInt, v16i32>, - avx512_store<0x7F, VR512, VK16WM, i512mem, alignedstore512, - "vmovdqa32", SSEPackedInt, v16i32>, - PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVDQA64: avx512_load<0x6F, VR512, VK8WM, i512mem, alignedloadv8i64, - "vmovdqa64", SSEPackedInt, v8i64>, - avx512_store<0x7F, VR512, VK8WM, i512mem, alignedstore512, - "vmovdqa64", SSEPackedInt, v8i64>, - PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VMOVDQU32: avx512_load<0x6F, VR512, VK16WM, i512mem, load, - "vmovdqu32", SSEPackedInt, v16i32>, - avx512_store<0x7F, VR512, VK16WM, i512mem, store, - 
"vmovdqu32", SSEPackedInt, v16i32>, - XS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load, - "vmovdqu64", SSEPackedInt, v8i64>, - avx512_store<0x7F, VR512, VK8WM, i512mem, store, - "vmovdqu64", SSEPackedInt, v8i64>, - XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), + (VMOVUPSZmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src)), + (VMOVUPSZmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src)), + (VMOVUPDZmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, + (bc_v16f32 (v16i32 immAllZerosV)))), + (VMOVUPSZrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16f32 (masked_load addr:$ptr, VK16WM:$mask, (v16f32 VR512:$src0))), + (VMOVUPSZrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8f64 (v16i32 immAllZerosV)))), + (VMOVUPDZrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f64 (masked_load addr:$ptr, VK8WM:$mask, (v8f64 VR512:$src0))), + (VMOVUPDZrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), + (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + +defm 
VMOVDQA32 : avx512_load_vl<0x6F, "vmovdqa32", "alignedload", "i", "32", + "16", "8", "4", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqa32", "alignedstore", + "512", "256", "", "i", "32", "16", "8", "4", + SSEPackedInt, HasAVX512>, + PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_load_vl<0x6F, "vmovdqa64", "alignedload", "i", "64", + "8", "4", "2", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqa64", "alignedstore", + "512", "256", "", "i", "64", "8", "4", "2", + SSEPackedInt, HasAVX512>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", "load", "i", "8", + "64", "32", "16", SSEPackedInt, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu8", "store", "", "", "", + "i", "8", "64", "32", "16", SSEPackedInt, + HasBWI>, XD, EVEX_CD8<8, CD8VF>; + +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", "load", "i", "16", + "32", "16", "8", SSEPackedInt, HasBWI>, + avx512_store_vl<0x7F, "vmovdqu16", "store", "", "", "", + "i", "16", "32", "16", "8", SSEPackedInt, + HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", "load", "i", "32", + "16", "8", "4", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu32", "store", "", "", "", + "i", "32", "16", "8", "4", SSEPackedInt, + HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", "load", "i", "64", + "8", "4", "2", SSEPackedInt, HasAVX512>, + avx512_store_vl<0x7F, "vmovdqu64", "store", "", "", "", + "i", "64", "8", "4", "2", SSEPackedInt, + HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr, (v16i32 immAllZerosV), GR16:$mask)), - (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; + (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>; def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr, - (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), - (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, 
VK8WM)), addr:$ptr)>; + (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)), + (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>; def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src), - GR16:$mask), - (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), + GR16:$mask), + (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src)>; def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src), - GR8:$mask), - (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), + GR8:$mask), + (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; let AddedComplexity = 20 in { def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src), - (bc_v8i64 (v16i32 immAllZerosV)))), - (VMOVDQU64rrkz VK8WM:$mask, VR512:$src)>; + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrrkz VK8WM:$mask, VR512:$src)>; def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), - (v8i64 VR512:$src))), - (VMOVDQU64rrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), + (v8i64 VR512:$src))), + (VMOVDQU64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), VK8), VR512:$src)>; def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src), (v16i32 immAllZerosV))), - (VMOVDQU32rrkz VK16WM:$mask, VR512:$src)>; + (VMOVDQU32Zrrkz VK16WM:$mask, VR512:$src)>; def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), - (v16i32 VR512:$src))), - (VMOVDQU32rrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; - -def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), - (v16f32 VR512:$src2))), - (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>; -def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), - (v8f64 VR512:$src2))), - (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; -def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), - (v16i32 VR512:$src2))), - (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>; -def : 
Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), - (v8i64 VR512:$src2))), - (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>; + (v16i32 VR512:$src))), + (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 immAllZerosV))), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, undef)), + (VMOVDQU32Zrmkz VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v16i32 (masked_load addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src0))), + (VMOVDQU32Zrmk VR512:$src0, VK16WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, + (bc_v8i64 (v16i32 immAllZerosV)))), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (VMOVDQU64Zrmkz VK8WM:$mask, addr:$ptr)>; + +def: Pat<(v8i64 (masked_load addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src0))), + (VMOVDQU64Zrmk VR512:$src0, VK8WM:$mask, addr:$ptr)>; + +def: Pat<(masked_store addr:$ptr, VK16WM:$mask, (v16i32 VR512:$src)), + (VMOVDQU32Zmrk addr:$ptr, VK16WM:$mask, VR512:$src)>; + +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i64 VR512:$src)), + (VMOVDQU64Zmrk addr:$ptr, VK8WM:$mask, VR512:$src)>; + +// SKX replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Z256mrk addr:$ptr, VK8WM:$mask, VR256:$src)>; + +// KNL replacement +def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), + (VMOVDQU32Zmrk addr:$ptr, + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; + +def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), + (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz + (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; + + // Move Int Doubleword to Packed Double Int // def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), @@ -1638,12 +2469,12 @@ def VMOVQI2PQIZrm : 
AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, RegisterClass RC, +multiclass avx512_move_scalar <string asm, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, PatFrag mem_pat> { let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128X:$dst, (vt (OpNode VR128X:$src1, (scalar_to_vector RC:$src2))))], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; @@ -1651,16 +2482,22 @@ multiclass avx512_move_scalar <string asm, RegisterClass RC, def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), !strconcat(asm, - " \t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), + "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, EVEX, VEX_LIG; + let mayStore = 1 in { def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, " \t{$src, $dst|$dst, $src}"), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG; + def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], IIC_SSE_MOV_S_MR>, + EVEX, VEX_LIG, EVEX_K; + } // mayStore } //hasSideEffects = 0 } @@ -1680,6 +2517,10 @@ def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 
FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; +def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), @@ -1710,7 +2551,7 @@ let Predicates = [HasAVX512] in { // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (v4f32 (V_SET0)), + (VMOVSSZrr (v4f32 (V_SET0)), (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), (SUBREG_TO_REG (i32 0), @@ -1839,7 +2680,7 @@ let AddedComplexity = 15 in def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set VR128X:$dst, (v2i64 (X86vzmovl + [(set VR128X:$dst, (v2i64 (X86vzmovl (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; @@ -1861,7 +2702,7 @@ let Predicates = [HasAVX512] in { (VMOV64toPQIZrr GR64:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIZrr GR32:$src)>; - + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), @@ -1898,136 +2739,201 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), //===----------------------------------------------------------------------===// // AVX-512 - Non-temporals //===----------------------------------------------------------------------===// +let SchedRW = [WriteLoad] in { + def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))], + 
SSEPackedInt>, EVEX, T8PD, EVEX_V512, + EVEX_CD8<64, CD8VF>; + + let Predicates = [HasAVX512, HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V256, + EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", [], + SSEPackedInt>, EVEX, T8PD, EVEX_V128, + EVEX_CD8<64, CD8VF>; + } +} -def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst), - (ins i512mem:$src), - "vmovntdqa\t{$src, $dst|$dst, $src}", - [(set VR512:$dst, - (int_x86_avx512_movntdqa addr:$src))]>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; - -// Prefer non-temporal over temporal versions -let AddedComplexity = 400, SchedRW = [WriteStore] in { - -def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs), - (ins f512mem:$dst, VR512:$src), - "vmovntps\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v16f32 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; - -def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs), - (ins f512mem:$dst, VR512:$src), - "vmovntpd\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8f64 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - - -def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs), - (ins i512mem:$dst, VR512:$src), - "vmovntdq\t{$src, $dst|$dst, $src}", - [(alignednontemporalstore (v8i64 VR512:$src), - addr:$dst)], - IIC_SSE_MOVNT>, - EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; +multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag, + ValueType OpVT, RegisterClass RC, X86MemOperand memop, + Domain d, InstrItinClass itin = IIC_SSE_MOVNT> { + let SchedRW = [WriteStore], mayStore = 1, + AddedComplexity = 400 in + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag 
(OpVT RC:$src), addr:$dst)], d, itin>, EVEX; } +multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag, + string elty, string elsz, string vsz512, + string vsz256, string vsz128, Domain d, + Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> { + let Predicates = [prd] in + defm Z : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz512##elty##elsz), VR512, + !cast<X86MemOperand>(elty##"512mem"), d, itin>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz256##elty##elsz), VR256X, + !cast<X86MemOperand>(elty##"256mem"), d, itin>, + EVEX_V256; + + defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag, + !cast<ValueType>("v"##vsz128##elty##elsz), VR128X, + !cast<X86MemOperand>(elty##"128mem"), d, itin>, + EVEX_V128; + } +} + +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore, + "i", "64", "8", "4", "2", SSEPackedInt, + HasAVX512>, PD, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore, + "f", "64", "8", "4", "2", SSEPackedDouble, + HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, + "f", "32", "16", "8", "4", SSEPackedSingle, + HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + //===----------------------------------------------------------------------===// // AVX-512 - Integer arithmetic // multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass KRC, - RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, PatFrag scalar_mfrag, - X86MemOperand x86scalar_mop, string BrdcstStr, - OpndItins itins, bit IsCommutable = 0> { - let isCommutable = IsCommutable in - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], - 
itins.rr>, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (OpVT RC:$src2)), - RC:$src0)))], - itins.rr>, EVEX_4V, EVEX_K; - def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" , - "|$dst {${mask}} {z}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (OpVT RC:$src2)), - (OpVT immAllZerosV))))], - itins.rr>, EVEX_4V, EVEX_KZ; + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + "", itins.rr, IsCommutable>, + AVX512BIBase, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)))), + "", itins.rm>, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))), + "", itins.rm>, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, 
SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; } +} - let mayLoad = 1 in { - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))], - itins.rm>, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, KRC:$mask, RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)), - RC:$src0)))], - itins.rm>, EVEX_4V, EVEX_K; - def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), (memop_frag addr:$src2)), - (OpVT immAllZerosV))))], - itins.rm>, EVEX_4V, EVEX_KZ; - } - def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))], - itins.rm>, EVEX_4V, EVEX_B; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src0, 
KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", - BrdcstStr, "}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))), - RC:$src0)))], - itins.rm>, EVEX_4V, EVEX_B, EVEX_K; - def rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", - BrdcstStr, "}"), - [(set RC:$dst, (OpVT (vselect KRC:$mask, - (OpNode (OpVT RC:$src1), - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))), - (OpVT immAllZerosV))))], - itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ; - } +multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, itins, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, itins, + IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, itins, + IsCommutable>, EVEX_V128; } } +multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, + itins, prd, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, + itins, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate 
prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info, + itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>; +} + +multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info, + itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>; +} + +multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd, + IsCommutable>; + + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd, + IsCommutable>; + + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + OpndItins itins, bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + itins, HasAVX512, IsCommutable>, + avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + itins, HasBWI, IsCommutable>; +} + multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT, ValueType SrcVT, RegisterClass KRC, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, @@ -2037,73 +2943,64 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, ValueType DstVT, { def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; def rrk : 
AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), [], itins.rr>, EVEX_4V, EVEX_K; def rrkz : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst {${mask}} {z}" , + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}} {z}" , "|$dst {${mask}} {z}, $src1, $src2}"), [], itins.rr>, EVEX_4V, EVEX_KZ; } let mayLoad = 1 in { def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), [], itins.rm>, EVEX_4V, EVEX_K; def rmkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), + "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), [], itins.rm>, EVEX_4V, EVEX_KZ; def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), [], itins.rm>, EVEX_4V, EVEX_B; def rmbk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"), [], itins.rm>, EVEX_4V, EVEX_B, EVEX_K; def 
rmbkz : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", BrdcstStr, "}"), [], itins.rm>, EVEX_4V, EVEX_B, EVEX_KZ; } } -defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W; - -defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SSE_INTALU_ITINS_P, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SSE_INTALU_ITINS_P, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, memopv8i64, i512mem, loadi64, i64mem, "{1to8}", @@ -2124,41 +3021,33 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1), (v16i32 
VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPMULDQZrr VR512:$src1, VR512:$src2)>; -defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax, + 
SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), @@ -2223,12 +3112,12 @@ multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, X86MemOperand x86memop> { def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], IIC_SSE_UNPCK>, EVEX_4V; def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (bitconvert (memop_frag addr:$src2)))))], IIC_SSE_UNPCK>, EVEX_4V; @@ -2250,19 +3139,19 @@ defm VPUNPCKHQDQZ : 
avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, // multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, + SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> { def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>, EVEX; def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, i8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpVT (OpNode (mem_frag addr:$src1), (i8 imm:$src2))))]>, EVEX; @@ -2271,48 +3160,18 @@ multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC, defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedSingle in -defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp, - memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512, - EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp, - memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512, - VEX_W, EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm 
VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SSE_INTALU_ITINS_P, HasAVX512, 1>; //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic @@ -2340,118 +3199,58 @@ defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>; } multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass KRC, - 
RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag, - X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, - Domain d, OpndItins itins, bit commutable> { - let isCommutable = commutable in { - def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>, - EVEX_4V; - - def rrk: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} |$dst {${mask}}, $src1, $src2}"), - [], itins.rr, d>, EVEX_4V, EVEX_K; + X86VectorVTInfo _, bit IsCommutable> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} - def rrkz: PI<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [], itins.rr, d>, EVEX_4V, EVEX_KZ; - } +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + bit IsCommutable = 0> { + defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, + IsCommutable>, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, + 
IsCommutable>, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; - let mayLoad = 1 in { - def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], - itins.rm, d>, EVEX_4V; - - def rmb : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, - (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))], - itins.rm, d>, EVEX_4V, EVEX_B; - - def rmk : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], itins.rm, d>, EVEX_4V, EVEX_K; - - def rmkz : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, $src2}"), - [], itins.rm, d>, EVEX_4V, EVEX_KZ; - - def rmbk : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr, - " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}}|$dst {${mask}}, $src1, ${src2}", BrdcstStr, "}"), - [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_K; - - def rmbkz : PI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, x86scalar_mop:$src2), !strconcat(OpcodeStr, - " \t{${src2}", BrdcstStr, - ", $src1, $dst {${mask}} {z}|$dst {${mask}} {z}, $src1, ${src2}", - BrdcstStr, "}"), - [], itins.rm, d>, EVEX_4V, EVEX_B, EVEX_KZ; + // Define only if AVX512VL feature is present. 
+ let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, + IsCommutable>, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, + IsCommutable>, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, + IsCommutable>, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, + IsCommutable>, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; } } -defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 1>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - 
EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 1>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; -defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VK16WM, VR512, v16f32, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, - SSE_ALU_ITINS_P.s, 0>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - -defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VK8WM, VR512, v8f64, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble, - SSE_ALU_ITINS_P.d, 0>, - EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, 1>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, 1>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>; def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), @@ -2476,18 +3275,18 @@ def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 VR512:$src1), // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, + RegisterClass 
RC, X86MemOperand x86memop, PatFrag memop_frag, SDNode OpNode, ValueType vt> { def rr : AVX512PI<opc, MRMSrcReg, - (outs KRC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + (outs KRC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], SSEPackedInt>, EVEX_4V; def rm : AVX512PI<opc, MRMSrcMem, - (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode (vt RC:$src1), + (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode (vt RC:$src1), (bitconvert (memop_frag addr:$src2))))], SSEPackedInt>, EVEX_4V; } @@ -2514,154 +3313,122 @@ def : Pat <(i16 (int_x86_avx512_mask_ptestm_d_512 (v16i32 VR512:$src1), def : Pat <(i8 (int_x86_avx512_mask_ptestm_q_512 (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))), (COPY_TO_REGCLASS (VPTESTMQZrr VR512:$src1, VR512:$src2), GR8)>; + //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, RegisterClass RC, - ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, - RegisterClass KRC> { - def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst), - (ins RC:$src1, i8imm:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))], - SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V; - def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, 
EVEX_K; - def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpNode (mem_frag addr:$src1), - (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V; - def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src1, i8imm:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K; + string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), + (ins _.RC:$src1, i8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIi8Base, EVEX_4V; + defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.MemOpFrag addr:$src1), (i8 imm:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIi8Base, EVEX_4V; } multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, ValueType SrcVT, - PatFrag bc_frag, RegisterClass KRC> { - // src2 is always 128-bit - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, VR128X:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))], - SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V; - def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K; - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, i128mem:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (OpNode RC:$src1, - (bc_frag (memopv2i64 
addr:$src2)))))], - SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V; - def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, RC:$src1, i128mem:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"), - [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K; + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, VR128X:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, i128mem:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (bc_frag (memopv2i64 addr:$src2)))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, EVEX_4V; +} + +multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, _>, EVEX_V512; +} + +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, + v16i32_info>, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, + v8i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; } defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl, 
- VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai, - VR512, v16i32, i512mem, memopv16i32, VK16WM>, + v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra, - VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VQ>; - defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai, - VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512, + v8i64_info>, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; -defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra, - VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512, - EVEX_CD8<64, CD8VQ>, VEX_W; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, "vpsll", X86vshl>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, "vpsra", X86vsra>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, "vpsrl", X86vsrl>; //===-------------------------------------------------------------------===// // Variable Bit Shifts //===-------------------------------------------------------------------===// multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType vt, - X86MemOperand x86memop, PatFrag mem_frag> { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, 
$dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (vt (OpNode RC:$src1, (vt RC:$src2))))]>, - EVEX_4V; - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, - (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>, - EVEX_4V; + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2))), + " ", SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V; } -defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; -defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, - i512mem, memopv16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, - i512mem, memopv8i64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; +multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo _> { + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; +} + +multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, + 
avx512vl_i32_info>, EVEX_CD8<32, CD8VQ>; + defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, + avx512vl_i64_info>, EVEX_CD8<64, CD8VQ>, VEX_W; +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>; //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, +multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, X86MemOperand x86memop, PatFrag memop_frag> { def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX; def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX; } @@ -2678,11 +3445,11 @@ multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, ValueType vt, RegisterClass RC, PatFrag mem_frag, X86MemOperand x86memop> { def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX; let mayLoad = 1 in def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX; } @@ -2729,175 +3496,139 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // FMA 
- Fused Multiply Operations // + let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { - def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3), - !strconcat(OpcodeStr," \t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>; +// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. +multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode = null_frag> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, x86memop:$src3), - !strconcat(OpcodeStr, " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3))))]>; - def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, x86scalar_mop:$src3), - !strconcat(OpcodeStr, " \t{${src3}", BrdcstStr, - ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, RC:$src2, - (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B; -} + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode 
_.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, + AVX512FMA3Base, EVEX_B; + } } // Constraints = "$src1 = $dst" -let ExeDomain = SSEPackedSingle in { - defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; +multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231, + string OpcodeStr, X86VectorVTInfo VTI, + SDPatternOperator OpNode> { + defm v213r : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), + VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; + + defm v231r : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix), + VTI>, EVEX_CD8<VTI.EltSize, CD8VF>; } + +multiclass avx512_fma3p<bits<8> opc213, bits<8> opc231, + string OpcodeStr, + SDPatternOperator OpNode> { +let ExeDomain = SSEPackedSingle in { + defm NAME##PSZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v16f32_info, OpNode>, EVEX_V512; + defm NAME##PSZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v8f32x_info, OpNode>, EVEX_V256; + defm NAME##PSZ128 : 
avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v4f32x_info, OpNode>, EVEX_V128; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v8f64_info, OpNode>, EVEX_V512, VEX_W; + defm NAME##PDZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v4f64x_info, OpNode>, EVEX_V256, VEX_W; + defm NAME##PDZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, + v2f64x_info, OpNode>, EVEX_V128, VEX_W; + } } +defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd>; +defm VFMSUB : avx512_fma3p<0xAA, 0xBA, "vfmsub", X86Fmsub>; +defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub>; +defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd>; +defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd>; +defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub>; + let Constraints = "$src1 = $dst" in { -multiclass 
avx512_fma3p_m132<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { +multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { let mayLoad = 1 in - def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src3, x86memop:$src2), - !strconcat(OpcodeStr, " \t{$src2, $src3, $dst|$dst, $src3, $src2}"), - [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>; - def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src3, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, - ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"), - [(set RC:$dst, (OpNode RC:$src1, - (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B; + def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2), + _.RC:$src3)))]>; + def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, + ", $src3, $dst|$dst, $src3, ${src2}", _.BroadcastStr, "}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), + _.RC:$src3))]>, EVEX_B; } } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_m132_f<bits<8> opc, + string OpcodeStr, + SDNode OpNode> { + let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32>, EVEX_V512, - 
EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -} + defm NAME##PSZ : avx512_fma3p_m132<opc, OpcodeStr##ps, + OpNode,v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ256 : avx512_fma3p_m132<opc, OpcodeStr##ps, + OpNode, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>; + defm NAME##PSZ128 : avx512_fma3p_m132<opc, OpcodeStr##ps, + OpNode, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>; + } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, 
v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm NAME##PDZ : avx512_fma3p_m132<opc, OpcodeStr##pd, + OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ256 : avx512_fma3p_m132<opc, OpcodeStr##pd, + OpNode, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<32, CD8VF>; + defm NAME##PDZ128 : avx512_fma3p_m132<opc, OpcodeStr##pd, + OpNode, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<32, CD8VF>; + } } +defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; +defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; +defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; +defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; +defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; +defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; + + // Scalar FMA let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, +multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType OpVT, + X86MemOperand x86memop, Operand memop, PatFrag mem_frag> { let isCommutable = 1 in def r : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; let mayLoad = 1 in def m : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, f128mem:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $dst|$dst, $src2, $src3}"), + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src2, RC:$src1, (mem_frag 
addr:$src3))))]>; @@ -2930,12 +3661,12 @@ multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { let hasSideEffects = 0 in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), - !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), - !strconcat(asm," \t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, EVEX_4V; } // hasSideEffects = 0 } @@ -3003,12 +3734,12 @@ multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstR string asm> { let hasSideEffects = 0 in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG, Requires<[HasAVX512]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, Requires<[HasAVX512]>; } // hasSideEffects = 0 } @@ -3106,10 +3837,10 @@ multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm> { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; } @@ -3182,21 +3913,21 @@ def : Pat<(extloadf32 addr:$src), def : 
Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, - RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, +multiclass avx512_vcvt_fp_with_rc<bits<8> opc, string asm, RegisterClass SrcRC, + RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT, ValueType InVT, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), [], d>, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; } // hasSideEffects = 0 @@ -3208,12 +3939,12 @@ multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX; let mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX; } // hasSideEffects = 0 @@ -3230,7 +3961,7 @@ defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend, 
EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; - + def : Pat<(v8f32 (int_x86_avx512_mask_cvtpd2ps_512 (v8f64 VR512:$src), (bc_v8f32(v8i32 immAllZerosV)), (i8 -1), (i32 FROUND_CURRENT))), (VCVTPD2PSZrr VR512:$src)>; @@ -3259,7 +3990,7 @@ defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, - memopv8f64, f512mem, v8i32, v8f64, + memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3277,7 +4008,7 @@ defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uin memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, PS, VEX_W, EVEX_CD8<64, CD8VF>; - + // cvttpd2udq (src, 0, mask-all-ones, sae-current) def : Pat<(v8i32 (int_x86_avx512_mask_cvttpd2udq_512 (v8f64 VR512:$src), (v8i32 immAllZerosV), (i8 -1), FROUND_CURRENT)), @@ -3287,16 +4018,16 @@ defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, memopv4i64, f256mem, v8f64, v8i32, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; - + defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, memopv16i32, f512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), - (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3304,7 +4035,7 @@ def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; - + def : Pat<(v4f32 (uint_to_fp 
(v4i32 VR128X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>; @@ -3331,14 +4062,14 @@ multiclass avx512_vcvt_fp2int<bits<8> opc, string asm, RegisterClass SrcRC, X86MemOperand x86memop, Domain d> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [], d>, EVEX; def rrb : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), - !strconcat(asm," \t{$rc, $src, $dst|$dst, $src, $rc}"), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), [], d>, EVEX, EVEX_B, EVEX_RC; let mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(asm," \t{$src, $dst|$dst, $src}"), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [], d>, EVEX; } // hasSideEffects = 0 } @@ -3397,12 +4128,12 @@ multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, X86MemOperand x86memop> { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), (ins srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; let hasSideEffects = 0, mayStore = 1 in def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2), - "vcvtps2ph \t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; } defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512, @@ -3449,7 +4180,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } - + /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop> { @@ -3457,12 +4188,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, def rr : 
AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; } } } @@ -3498,26 +4229,49 @@ def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, ValueType OpVt> { - def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, - " \t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpVt (OpNode RC:$src)))]>, - EVEX; - def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (OpVt (OpNode (mem_frag addr:$src))))]>, - EVEX; -} -defm VRSQRT14PSZ : avx512_fp14_p<0x4E, "vrsqrt14ps", X86frsqrt, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT14PDZ : avx512_fp14_p<0x4E, "vrsqrt14pd", X86frsqrt, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP14PSZ : avx512_fp14_p<0x4C, "vrcp14ps", X86frcp, VR512, f512mem, - memopv16f32, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP14PDZ : avx512_fp14_p<0x4C, "vrcp14pd", X86frcp, VR512, f512mem, - memopv8f64, v8f64>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + X86VectorVTInfo _> { + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", 
"$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD; + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, T8PD, EVEX_B; + } +} + +multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), @@ -3534,148 +4288,100 @@ def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), (VRCP14PDZr VR512:$src)>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd -multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
EVEX_4V; - def rrb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - " \t{{sae}, $src2, $src1, $dst|$dst, $src1, $src2, {sae}}"), - []>, EVEX_4V, EVEX_B; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; - } -} +multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B; + + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; } -defm VRCP28SS : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP28SD : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT28SS : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT28SD : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; - -def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 
immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; - -def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1), - FROUND_NO_EXC)), - (COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; +multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>, + EVEX_CD8<32, CD8VT1>; + defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>, + EVEX_CD8<64, CD8VT1>, VEX_W; +} +let hasSideEffects = 0, Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; +} /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd -multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop> { - let hasSideEffects = 0, Predicates = [HasERI] in { - def r : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, - " \t{$src, $dst|$dst, $src}"), - []>, EVEX; - def rb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, - " \t{{sae}, $src, $dst|$dst, $src, {sae}}"), - []>, EVEX, EVEX_B; - def m : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), - []>, EVEX; - } + +multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode> { + + defm r : AVX512_maskable<opc, MRMSrcReg, 
_, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>; + + defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, + "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), + "{sae}">, EVEX_B; + + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))), + (i32 FROUND_CURRENT))>; + + defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))), + (i32 FROUND_CURRENT))>, EVEX_B; } -defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRSQRT28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRSQRT28PDZrb VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)), - (VRCP28PSZrb VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)), - (VRCP28PDZrb VR512:$src)>; - -multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - Intrinsic V16F32Int, Intrinsic V8F64Int, - OpndItins itins_s, OpndItins itins_d> { - def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs 
VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>, - EVEX, EVEX_V512; - let mayLoad = 1 in - def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, - (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))], - itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>, + EVEX_CD8<32, CD8VF>; + defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + VEX_W, EVEX_CD8<32, CD8VF>; +} - def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>, - EVEX, EVEX_V512; +let Predicates = [HasERI], hasSideEffects = 0 in { - let mayLoad = 1 in - def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (OpNode - (v8f64 (bitconvert (memopv16f32 addr:$src)))))], - itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>; + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX, EVEX_V512, T8PD; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX, EVEX_V512, T8PD; +} -let isCodeGenOnly = 1 in { - def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, - "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (V16F32Int VR512:$src))]>, - EVEX, EVEX_V512; - def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, - (V16F32Int (memopv16f32 addr:$src)))]>, EVEX, - EVEX_V512, EVEX_CD8<32, CD8VF>; - def 
PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src), - !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (V8F64Int VR512:$src))]>, - EVEX, EVEX_V512, VEX_W; - def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src), - !strconcat(OpcodeStr, - "pd\t{$src, $dst|$dst, $src}"), - [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>, - EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -} // isCodeGenOnly = 1 +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.FloatVT (OpNode _.RC:$src))>, EVEX; + let mayLoad = 1 in { + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.FloatVT + (bitconvert (_.LdFrag addr:$src))))>, EVEX; + + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, EVEX_B; + } } multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, @@ -3691,7 +4397,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XS, EVEX_4V; let mayLoad = 1 in { @@ -3705,7 +4411,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, ssmem:$src2), !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2))], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; } @@ -3719,7 +4425,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, VR128X:$src2), 
!strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, + [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2))], itins_s.rr>, XD, EVEX_4V, VEX_W; let mayLoad = 1 in { @@ -3733,21 +4439,51 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, (ins VR128X:$src1, sdmem:$src2), !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, + [(set VR128X:$dst, + (F64Int VR128X:$src1, sse_load_f64:$src2))]>, XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; } } +multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + v8f64_info>, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v4f32x_info>, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, v8f32x_info>, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v2f64x_info>, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, v4f64x_info>, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} + +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>, - avx512_sqrt_packed<0x51, "vsqrt", fsqrt, - int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512, - SSE_SQRTPS, SSE_SQRTPD>; +defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", + int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, + SSE_SQRTSS, SSE_SQRTSD>; let 
Predicates = [HasAVX512] in { + def : Pat<(v16f32 (int_x86_avx512_sqrt_ps_512 (v16f32 VR512:$src1), + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_CURRENT)), + (VSQRTPSZr VR512:$src1)>; + def : Pat<(v8f64 (int_x86_avx512_sqrt_pd_512 (v8f64 VR512:$src1), + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_CURRENT)), + (VSQRTPDZr VR512:$src1)>; + def : Pat<(f32 (fsqrt FR32X:$src)), (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; def : Pat<(f32 (fsqrt (load addr:$src))), @@ -3856,7 +4592,7 @@ let ExeDomain = GenericDomain in { (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128X:$dst, (F32Int VR128X:$src1, + [(set VR128X:$dst, (F32Int VR128X:$src1, sse_load_f32:$src2, imm:$src3))]>, EVEX_CD8<32, CD8VT1>; @@ -3897,14 +4633,14 @@ let ExeDomain = d in { def r : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX; // Vector intrinsic operation, mem def m : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX; } // ExeDomain } @@ -3935,20 +4671,20 @@ let ExeDomain = d in { def r : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i32i8imm:$src3), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; def m : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i32i8imm:$src3), !strconcat(OpcodeStr, - " \t{$src2, $src1, $dst|$dst, $src1, $src2}"), + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; } // ExeDomain } defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", ssmem, FR32X, SSEPackedSingle>, EVEX_CD8<32, CD8VT1>; - + defm 
VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", sdmem, FR64X, SSEPackedDouble>, EVEX_CD8<64, CD8VT1>; @@ -4004,32 +4740,32 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, RegisterClass KRC, X86MemOperand x86memop> { def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), (ins srcRC:$src), - !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, EVEX; def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), (ins KRC:$mask, srcRC:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), []>, EVEX, EVEX_K; def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), (ins KRC:$mask, srcRC:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def mrk : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"), []>, EVEX, EVEX_K; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, +defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; @@ -4083,36 +4819,36 @@ multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC, def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT 
(OpNode (InVT SrcRC:$src))))]>, EVEX; def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, SrcRC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), [(set DstRC:$dst, (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>, EVEX; def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} |$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + !strconcat(OpcodeStr,"\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; } @@ -4160,7 +4896,7 @@ let mayLoad = 1, def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb), (ins RC:$src1, KRC:$mask, memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; } @@ -4177,7 +4913,7 @@ defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>, defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; } - + defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512, vy64xmem>, EVEX_V512, VEX_W, 
EVEX_CD8<64, CD8VT1>; defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>, @@ -4194,7 +4930,7 @@ let mayStore = 1, Constraints = "$mask = $mask_wb" in def mr : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb), (ins memop:$dst, KRC:$mask, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; } @@ -4227,7 +4963,7 @@ multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeSt RegisterClass KRC, X86MemOperand memop> { let Predicates = [HasPFI], hasSideEffects = 1 in def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), - !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>, EVEX, EVEX_K; } @@ -4242,7 +4978,7 @@ defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; - + defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; @@ -4287,14 +5023,14 @@ multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$src3), !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, (i8 
imm:$src3))))], d, IIC_SSE_SHUFP>, EVEX_4V, Sched<[WriteShuffle]>; @@ -4317,33 +5053,29 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1, (memopv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; -multiclass avx512_alignr<string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, EVEX_4V; +multiclass avx512_valign<X86VectorVTInfo _> { + defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i8imm:$src3), + "valign"##_.Suffix, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86VAlign _.RC:$src2, _.RC:$src1, + (i8 imm:$src3)))>, + AVX512AIi8Base, EVEX_4V; + + // Also match valign of packed floats. + def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>; + let mayLoad = 1 in - def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), - !strconcat(OpcodeStr, - " \t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), + !strconcat("valign"##_.Suffix, + "\t{$src3, $src2, $src1, $dst|" + "$dst, $src1, $src2, $src3}"), []>, EVEX_4V; } -defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; -def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNDrri VR512:$src2, 
VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>; +defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; // Helper fragments to match sext vXi1 to vXiY. def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; @@ -4354,43 +5086,43 @@ multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, ValueType OpVT, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, EVEX; def rmk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), []>, EVEX, EVEX_K; def rmkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), + "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; def rmb : AVX5128I<opc, MRMSrcMem, 
(outs VR512:$dst), (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", $dst|$dst, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B; def rmbk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", $dst {${mask}}|$dst {${mask}}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B, EVEX_K; def rmbkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", $dst {${mask}} {z}|$dst {${mask}} {z}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B, EVEX_KZ; @@ -4420,54 +5152,54 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPABSQZrr VR512:$src)>; -multiclass avx512_conflict<bits<8> opc, string OpcodeStr, +multiclass avx512_conflict<bits<8> opc, string OpcodeStr, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, X86MemOperand x86scalar_mop, string BrdcstStr> { def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, " \t{$src, ${dst} |${dst}, $src}"), + !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), []>, EVEX; def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, " \t{$src, ${dst}|${dst}, $src}"), + !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), []>, EVEX; def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_B; def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} 
{z}, $src}"), []>, EVEX, EVEX_KZ; def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, x86memop:$src), !strconcat(OpcodeStr, - " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), + "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), []>, EVEX, EVEX_KZ; def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, " \t{${src}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", BrdcstStr, "}"), []>, EVEX, EVEX_KZ, EVEX_B; - + let Constraints = "$src1 = $dst" in { def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, KRC:$mask, RC:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, x86memop:$src2), !strconcat(OpcodeStr, - " \t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>, EVEX, EVEX_K; def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), - !strconcat(OpcodeStr, " \t{${src2}", BrdcstStr, + !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), []>, EVEX, EVEX_K, EVEX_B; } @@ -4541,3 +5273,135 @@ def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), def : Pat<(truncstorei1 GR8:$src, addr:$dst), (MOV8mr addr:$dst, GR8:$src)>; +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMDestReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), + !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), + [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; +} + +multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] 
in + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +multiclass avx512_convert_mask_to_vector<string OpcodeStr> { + defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, + HasBWI>; + defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, + HasBWI>, VEX_W; + defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, + HasDQI>; + defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, + HasDQI>, VEX_W; +} + +defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// +multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, + _.ImmAllZerosV)))]>, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, + _.RC:$src0)))]>, EVEX_K; + + let mayStore = 1 in { + def mrk : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(store (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, undef)), + addr:$dst)]>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + } +} + +multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : compress_by_vec_width<opc, VTInfo.info512, OpcodeStr>, 
EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : compress_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : compress_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, + EVEX; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, + EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, + EVEX, VEX_W; + +// expand +multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr> { + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, (_.VT _.RC:$src), + _.ImmAllZerosV)))]>, EVEX_KZ; + + let Constraints = "$src0 = $dst" in + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, + (_.VT _.RC:$src), _.RC:$src0)))]>, EVEX_K; + + let mayLoad = 1, Constraints = "$src0 = $dst" in + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, + (_.VT (bitconvert + (_.LdFrag addr:$src))), + _.RC:$src0)))]>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + + let mayLoad = 1 in + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, + (_.VT (bitconvert (_.LdFrag addr:$src))), + _.ImmAllZerosV)))]>, + EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>; + +} + +multiclass 
expand_by_elt_width<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo> { + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, + EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, + EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, + EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, + EVEX, VEX_W; diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index f2574cc3700e..78efc4d57111 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -15,13 +15,13 @@ //===----------------------------------------------------------------------===// // LEA - Load Effective Address let SchedRW = [WriteLEA] in { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def LEA16r : I<0x8D, MRMSrcMem, - (outs GR16:$dst), (ins i32mem:$src), + (outs GR16:$dst), (ins anymem:$src), "lea{w}\t{$src|$dst}, {$dst|$src}", [], IIC_LEA_16>, OpSize16; let isReMaterializable = 1 in def LEA32r : I<0x8D, MRMSrcMem, - (outs GR32:$dst), (ins i32mem:$src), + (outs GR32:$dst), (ins anymem:$src), "lea{l}\t{$src|$dst}, {$dst|$src}", [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, OpSize32, Requires<[Not64BitMode]>; @@ -65,18 +65,18 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", [(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)], IIC_MUL8>, Sched<[WriteIMul]>; // AX,DX = AX*GR16 -let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in +let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in def MUL16r : I<0xF7, MRM4r, (outs), (ins 
GR16:$src), "mul{w}\t$src", [], IIC_MUL16_REG>, OpSize16, Sched<[WriteIMul]>; // EAX,EDX = EAX*GR32 -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), "mul{l}\t$src", [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/], IIC_MUL32_REG>, OpSize32, Sched<[WriteIMul]>; // RAX,RDX = RAX*GR64 -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), "mul{q}\t$src", [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/], @@ -91,7 +91,7 @@ def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), [(set AL, (mul AL, (loadi8 addr:$src))), (implicit EFLAGS)], IIC_MUL8>, SchedLoadReg<WriteIMulLd>; // AX,DX = AX*[mem16] -let mayLoad = 1, neverHasSideEffects = 1 in { +let mayLoad = 1, hasSideEffects = 0 in { let Defs = [AX,DX,EFLAGS], Uses = [AX] in def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), "mul{w}\t$src", @@ -107,7 +107,7 @@ def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>; } -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { // AL,AH = AL*GR8 let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", [], @@ -145,7 +145,7 @@ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>; } -} // neverHasSideEffects +} // hasSideEffects let Defs = [EFLAGS] in { @@ -456,64 +456,29 @@ def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), "inc{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))], IIC_UNARY_REG>; - -let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. 
-def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), "inc{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], + IIC_UNARY_REG>, OpSize16; +def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), "inc{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; + IIC_UNARY_REG>, OpSize32; def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))], IIC_UNARY_REG>; -} // isConvertibleToThreeAddress = 1, CodeSize = 1 - - -// In 64-bit mode, single byte INC and DEC cannot be encoded. -let isConvertibleToThreeAddress = 1, CodeSize = 2 in { -// Can transform into LEA. 
-def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "inc{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], - IIC_UNARY_REG>, - OpSize16, Requires<[In64BitMode]>; -def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "inc{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[In64BitMode]>; -def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), - "dec{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], - IIC_UNARY_REG>, - OpSize16, Requires<[In64BitMode]>; -def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "dec{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[In64BitMode]>; } // isConvertibleToThreeAddress = 1, CodeSize = 2 -let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, - CodeSize = 2 in { -def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), - "inc{w}\t$dst", [], IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), - "inc{l}\t$dst", [], IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; -def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), - "dec{w}\t$dst", [], IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), - "dec{l}\t$dst", [], IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; -} // isCodeGenOnly = 1, ForceDisassemble = 1, HasSideEffects = 0, CodeSize = 2 - +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+let CodeSize = 1, hasSideEffects = 0 in { +def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 } // Constraints = "$src1 = $dst", SchedRW let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { @@ -522,35 +487,13 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { (implicit EFLAGS)], IIC_UNARY_MEM>; def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; - -// These are duplicates of their 32-bit counterparts. Only needed so X86 knows -// how to unfold them. -// FIXME: What is this for?? 
-def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", - [(store (add (loadi16 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[In64BitMode]>; -def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", - [(store (add (loadi32 addr:$dst), 1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[In64BitMode]>; -def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", - [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[In64BitMode]>; -def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", - [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[In64BitMode]>; } // CodeSize = 2, SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { @@ -559,21 +502,29 @@ def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), "dec{b}\t$dst", [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))], IIC_UNARY_REG>; -let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA. -def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. 
+def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), "dec{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))], - IIC_UNARY_REG>, - OpSize16, Requires<[Not64BitMode]>; -def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + IIC_UNARY_REG>, OpSize16; +def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), "dec{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))], - IIC_UNARY_REG>, - OpSize32, Requires<[Not64BitMode]>; + IIC_UNARY_REG>, OpSize32; def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))], IIC_UNARY_REG>; -} // CodeSize = 2 +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. +let CodeSize = 1, hasSideEffects = 0 in { +def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", [], IIC_UNARY_REG>, + OpSize16, Requires<[Not64BitMode]>; +def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", [], IIC_UNARY_REG>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 } // Constraints = "$src1 = $dst", SchedRW @@ -583,12 +534,10 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { (implicit EFLAGS)], IIC_UNARY_MEM>; def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", [(store (add (loadi16 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize16, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize16; def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", [(store (add (loadi32 addr:$dst), -1), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>, - OpSize32, Requires<[Not64BitMode]>; + (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; @@ 
-710,15 +659,6 @@ class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALU]>; -// BinOpRR_R - Instructions like "add reg, reg, reg", where the pattern has -// just a regclass (no eflags) as a result. -class BinOpRR_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], - IIC_BIN_NONMEM>; - // BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has // just a EFLAGS as a result. class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, @@ -825,13 +765,6 @@ class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let ImmT = typeinfo.ImmEncoding; } -// BinOpRI_R - Instructions like "add reg, reg, imm". -class BinOpRI_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; - // BinOpRI_F - Instructions like "cmp reg, imm". class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDPatternOperator opnode, Format f> @@ -864,30 +797,23 @@ class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let ImmT = Imm8; // Always 8-bit immediate. } -// BinOpRI8_R - Instructions like "add reg, reg, imm8". -class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; - // BinOpRI8_F - Instructions like "cmp reg, imm8". 
class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), [(set EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; // BinOpRI8_RF - Instructions like "add reg, reg, imm8". class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; // BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), [(set typeinfo.RegClass:$dst, EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, @@ -923,8 +849,8 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>; // BinOpMI - Instructions like "add [mem], imm". -class BinOpMI<string mnemonic, X86TypeInfo typeinfo, - Format f, list<dag> pattern, bits<8> opcode = 0x80, +class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM> : ITy<opcode, f, typeinfo, (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), @@ -934,27 +860,26 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo, } // BinOpMI_RMW - Instructions like "add [mem], imm". 
-class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo, +class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode, Format f> - : BinOpMI<mnemonic, typeinfo, f, + : BinOpMI<opcode, mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>; // BinOpMI_RMW_FF - Instructions like "adc [mem], imm". -class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpMI<mnemonic, typeinfo, f, +class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, [(store (opnode (typeinfo.VT (load addr:$dst)), typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), - (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>; + (implicit EFLAGS)], IIC_BIN_CARRY_MEM>; // BinOpMI_F - Instructions like "cmp [mem], imm". -class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode, Format f, bits<8> opcode = 0x80> - : BinOpMI<mnemonic, typeinfo, f, +class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, [(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)), - typeinfo.ImmOperator:$src))], - opcode>; + typeinfo.ImmOperator:$src))]>; // BinOpMI8 - Instructions like "add [mem], imm8". class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, @@ -969,7 +894,7 @@ class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8_RMW - Instructions like "add [mem], imm8". class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpMI8<mnemonic, typeinfo, f, [(store (opnode (load addr:$dst), typeinfo.Imm8Operator:$src), addr:$dst), @@ -977,7 +902,7 @@ class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". 
class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpMI8<mnemonic, typeinfo, f, [(store (opnode (load addr:$dst), typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), @@ -985,7 +910,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, // BinOpMI8_F - Instructions like "cmp [mem], imm8". class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> + SDPatternOperator opnode, Format f> : BinOpMI8<mnemonic, typeinfo, f, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.Imm8Operator:$src))]>; @@ -1023,12 +948,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, bit CommutableRR, bit ConvertibleToThreeAddress> { let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { - let isCommutable = CommutableRR, - isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; - def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; - def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; - def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isConvertibleToThreeAddress } // isCommutable def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; @@ -1041,6 +967,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order 
specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. @@ -1048,7 +976,6 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; - def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; @@ -1066,10 +993,20 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; - def NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>; + def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>; + } } // Defs = [EFLAGS] def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, @@ -1094,12 +1031,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, bit ConvertibleToThreeAddress> { let Uses = [EFLAGS], Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { - let isCommutable = CommutableRR, - isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress } // isCommutable def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; @@ -1112,6 +1050,8 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
@@ -1119,7 +1059,6 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; - def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; @@ -1137,10 +1076,20 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; - def NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>; + } } // Uses = [EFLAGS], Defs = [EFLAGS] def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, @@ -1162,12 +1111,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { let Defs = [EFLAGS] in { - let isCommutable = CommutableRR, - isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } } // isCommutable def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; @@ -1180,6 +1130,8 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
@@ -1187,7 +1139,6 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; - def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; @@ -1204,10 +1155,19 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; - def NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>; + def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1 in + def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>; + } } // Defs = [EFLAGS] def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, @@ -1272,15 +1232,15 @@ let isCompare = 1 in { def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; - def TEST8mi : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>; - def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>; - def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>; - def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>; + def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; + def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; + def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the // register class is constrained to GR8_NOREX. This pseudo is explicitly // marked side-effect free, since it doesn't have an isel pattern like - // other test instructions. + // other test instructions. 
let isPseudo = 1, hasSideEffects = 0 in def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask), "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; @@ -1332,7 +1292,7 @@ let Predicates = [HasBMI] in { // MULX Instruction // multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { let isCommutable = 1 in def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), @@ -1355,49 +1315,54 @@ let Predicates = [HasBMI2] in { //===----------------------------------------------------------------------===// // ADCX Instruction // -let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { +let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], + Constraints = "$src0 = $dst", AddedComplexity = 10 in { let SchedRW = [WriteALU] in { - def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "adcx{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8PD; - - def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "adcx{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8PD, Requires<[In64BitMode]>; + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src0, GR32:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, GR32:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD; + def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src0, GR64:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, GR64:$src, EFLAGS))], + IIC_BIN_CARRY_NONMEM>, T8PD; } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { - def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "adcx{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8PD; - - def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "adcx{q}\t{$src, $dst|$dst, $src}", - [], 
IIC_BIN_MEM>, T8PD, Requires<[In64BitMode]>; + def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src0, i32mem:$src), "adcx{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src0, (loadi32 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD; + + def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src0, i64mem:$src), "adcx{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src0, (loadi64 addr:$src), EFLAGS))], + IIC_BIN_CARRY_MEM>, T8PD; } } //===----------------------------------------------------------------------===// // ADOX Instruction // -let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in { +let Predicates = [HasADX], hasSideEffects = 0, Defs = [EFLAGS], + Uses = [EFLAGS] in { let SchedRW = [WriteALU] in { def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "adox{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8XS; + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "adox{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_NONMEM>, T8XS, Requires<[In64BitMode]>; + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_NONMEM>, T8XS; } // SchedRW let mayLoad = 1, SchedRW = [WriteALULd] in { def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "adox{l}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8XS; + "adox{l}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "adox{q}\t{$src, $dst|$dst, $src}", - [], IIC_BIN_MEM>, T8XS, Requires<[In64BitMode]>; + "adox{q}\t{$src, $dst|$dst, $src}", [], IIC_BIN_MEM>, T8XS; } } diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index e421f8c7b985..2056056d23a5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -21,8 +21,8 @@ // 
//===----------------------------------------------------------------------===// -#ifndef X86INSTRBUILDER_H -#define X86INSTRBUILDER_H +#ifndef LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H +#define LLVM_LIB_TARGET_X86_X86INSTRBUILDER_H #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index ca4f608a6b80..880c982982a1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -32,7 +32,7 @@ def GetLo8XForm : SDNodeXForm<imm, [{ // PIC base construction. This expands to code that looks like this: // call $next_inst // popl %destreg" -let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), "", []>; @@ -43,15 +43,18 @@ let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. 
let Defs = [ESP, EFLAGS], Uses = [ESP] in { -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKDOWN", - [(X86callseq_start timm:$amt)]>, - Requires<[Not64BitMode]>; + []>, + Requires<[NotLP64]>; def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; } +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; + // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into // a stack adjustment and the codegen must know that they may modify the stack @@ -59,16 +62,17 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [RSP, EFLAGS], Uses = [RSP] in { -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKDOWN", - [(X86callseq_start timm:$amt)]>, - Requires<[In64BitMode]>; + []>, + Requires<[IsLP64]>; def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[In64BitMode]>; + Requires<[IsLP64]>; } - +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; // x86-64 va_start lowering magic. 
@@ -118,7 +122,7 @@ def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), "# variable sized alloca for segmented stacks", [(set GR32:$dst, (X86SegAlloca GR32:$size))]>, - Requires<[Not64BitMode]>; + Requires<[NotLP64]>; let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), @@ -214,6 +218,8 @@ let isPseudo = 1 in { "#SEH_PushFrame $mode", []>; def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), "#SEH_EndPrologue", []>; + def SEH_Epilogue : I<0, Pseudo, (outs), (ins), + "#SEH_Epilogue", []>; } //===----------------------------------------------------------------------===// @@ -257,7 +263,7 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, - isCodeGenOnly = 1, neverHasSideEffects = 1 in + isCodeGenOnly = 1, hasSideEffects = 0 in def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; @@ -407,7 +413,8 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { // All calls clobber the non-callee saved registers. ESP is marked as // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. -let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], @@ -426,7 +433,8 @@ def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. 
let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, - FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], @@ -596,12 +604,12 @@ def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, "{$src2, $dst|$dst, $src2}"), [], IIC_ALU_MEM>, OpSize32, LOCK; -def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, - ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, - ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), - !strconcat(mnemonic, "{q}\t", - "{$src2, $dst|$dst, $src2}"), - [], IIC_ALU_MEM>, LOCK; +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [], IIC_ALU_MEM>, LOCK; def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, @@ -747,18 +755,88 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, TB, LOCK; -def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR8:$dst, (atomic_load_8 addr:$src))]>; -def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR16:$dst, (atomic_load_16 addr:$src))]>; -def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR32:$dst, (atomic_load_32 addr:$src))]>; -def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR64:$dst, (atomic_load_64 addr:$src))]>; +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * an operation directly on 
memory is generated instead of wasting a register. + * It is not automatic as atomic_store/load are only lowered to MOV instructions + * extremely late to prevent them from being accidentally reordered in the backend + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) + */ +multiclass RELEASE_BINOP_MI<string op> { + def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + // NAME#16 is not generated as 16-bit arithmetic instructions are considered + // costly and avoided as far as possible by this backend anyway + def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; +} +defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; +defm RELEASE_AND : RELEASE_BINOP_MI<"and">; +defm RELEASE_OR : RELEASE_BINOP_MI<"or">; +defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; +// Note: we don't deal with sub, because substractions of constants are +// optimized into additions before this code can run + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +defm RELEASE_INC : RELEASE_UNOP< + 
(add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; +defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), "#RELEASE_MOV PSEUDO!", @@ -773,11 +851,22 @@ def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), "#RELEASE_MOV PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins 
i16mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions. //===----------------------------------------------------------------------===// - // CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after // instruction selection into a branch sequence. let Uses = [EFLAGS], usesCustomInserter = 1 in { @@ -925,6 +1014,9 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), (MOV64mi32 addr:$dst, tblockaddress:$src)>, Requires<[NearData, IsStatic]>; +def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>; + // Calls // tls has some funny stuff here... @@ -1106,6 +1198,7 @@ def def32 : PatLeaf<(i32 GR32:$src), [{ return N->getOpcode() != ISD::TRUNCATE && N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && N->getOpcode() != ISD::CopyFromReg && + N->getOpcode() != ISD::AssertSext && N->getOpcode() != X86ISD::CMOV; }]>; @@ -1638,35 +1731,18 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; -// Increment reg. 
-// Do not make INC if it is slow -def : Pat<(add GR8:$src, 1), - (INC8r GR8:$src)>, Requires<[NotSlowIncDec]>; -def : Pat<(add GR16:$src, 1), - (INC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR16:$src, 1), - (INC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR32:$src, 1), - (INC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR32:$src, 1), - (INC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR64:$src, 1), - (INC64r GR64:$src)>, Requires<[NotSlowIncDec]>; - -// Decrement reg. -// Do not make DEC if it is slow -def : Pat<(add GR8:$src, -1), - (DEC8r GR8:$src)>, Requires<[NotSlowIncDec]>; -def : Pat<(add GR16:$src, -1), - (DEC16r GR16:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR16:$src, -1), - (DEC64_16r GR16:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR32:$src, -1), - (DEC32r GR32:$src)>, Requires<[NotSlowIncDec, Not64BitMode]>; -def : Pat<(add GR32:$src, -1), - (DEC64_32r GR32:$src)>, Requires<[NotSlowIncDec, In64BitMode]>; -def : Pat<(add GR64:$src, -1), - (DEC64r GR64:$src)>, Requires<[NotSlowIncDec]>; +// Increment/Decrement reg. +// Do not make INC/DEC if it is slow +let Predicates = [NotSlowIncDec] in { + def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; + def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; + def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; + def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; + def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; + def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; + def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +} // or reg/reg. 
def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 39ad3954af79..71415c6fde8e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -57,33 +57,32 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, // Unconditional branches. let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { - def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), - "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize32; - def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst), - "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>, OpSize16, - Requires<[In16BitMode]>; - let hasSideEffects = 0 in def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), - "jmp\t$dst", [], IIC_JMP_REL>; + "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize16; + def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst), + "jmp\t$dst", [], IIC_JMP_REL>, OpSize32; + } } // Conditional Branches. 
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { - let hasSideEffects = 0 in - def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [], - IIC_Jcc>; - def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, - [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, OpSize16, - TB, Requires<[In16BitMode]>; - def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm, - [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>, TB, - OpSize32; + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, + [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm, + [], IIC_Jcc>, OpSize16, TB; + def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm, + [], IIC_Jcc>, TB, OpSize32; + } } } defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; -defm JNO : ICBr<0x71, 0x81, "jno\t$dst" , X86_COND_NO>; +defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; @@ -106,20 +105,14 @@ let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in // jecxz. let Uses = [CX] in def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jcxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[Not64BitMode]>; + "jcxz\t$dst", [], IIC_JCXZ>, AdSize16; let Uses = [ECX] in - def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jecxz\t$dst", [], IIC_JCXZ>, Requires<[Not64BitMode]>; + def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jecxz\t$dst", [], IIC_JCXZ>, AdSize32; - // J*CXZ instruction: 64-bit versions of this instruction for the asmparser. 
- // In 64-bit mode, the address size prefix is jecxz and the unprefixed version - // is jrcxz. - let Uses = [ECX] in - def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jecxz\t$dst", [], IIC_JCXZ>, AdSize, Requires<[In64BitMode]>; let Uses = [RCX] in def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), - "jrcxz\t$dst", [], IIC_JCXZ>, Requires<[In64BitMode]>; + "jrcxz\t$dst", [], IIC_JCXZ>, AdSize64; } // Indirect branches @@ -145,14 +138,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>, Requires<[In64BitMode]>, Sched<[WriteJumpLd]>; - def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), - (ins i16imm:$off, i16imm:$seg), - "ljmp{w}\t{$seg, $off|$off, $seg}", [], - IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>; - def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), - (ins i32imm:$off, i16imm:$seg), - "ljmp{l}\t{$seg, $off|$off, $seg}", [], - IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + let Predicates = [Not64BitMode] in { + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t$seg, $off", [], + IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, Sched<[WriteJump]>; @@ -186,10 +181,11 @@ let isCall = 1 in (outs), (ins i32imm_pcrel:$dst), "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>; - def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, - (outs), (ins i16imm_pcrel:$dst), - "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16, - Sched<[WriteJump]>; + let hasSideEffects = 0 in + def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, + (outs), (ins i16imm_pcrel:$dst), + "call{w}\t$dst", [], IIC_CALL_RI>, OpSize16, + Sched<[WriteJump]>; def 
CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst), "call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>, OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>; @@ -207,14 +203,16 @@ let isCall = 1 in Requires<[Not64BitMode,FavorMemIndirectCall]>, Sched<[WriteJumpLd]>; - def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), - (ins i16imm:$off, i16imm:$seg), - "lcall{w}\t{$seg, $off|$off, $seg}", [], - IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>; - def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), - (ins i32imm:$off, i16imm:$seg), - "lcall{l}\t{$seg, $off|$off, $seg}", [], - IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + let Predicates = [Not64BitMode] in { + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t$seg, $off", [], + IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>; + } def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst), "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16, diff --git a/contrib/llvm/lib/Target/X86/X86InstrExtension.td b/contrib/llvm/lib/Target/X86/X86InstrExtension.td index 6be6a1fc6cb7..eea4f17db3f2 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrExtension.td +++ b/contrib/llvm/lib/Target/X86/X86InstrExtension.td @@ -11,7 +11,7 @@ // //===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { let Defs = [AX], Uses = [AL] in def CBW : I<0x98, RawFrm, (outs), (ins), "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL) @@ -39,7 +39,7 @@ let neverHasSideEffects = 1 in { // Sign/Zero extenders -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_R8>, TB, OpSize16, Sched<[WriteALU]>; @@ 
-47,7 +47,7 @@ let mayLoad = 1 in def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), "movs{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVSX_R16_M8>, TB, OpSize16, Sched<[WriteALULd]>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), "movs{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (sext GR8:$src))], IIC_MOVSX>, TB, @@ -65,7 +65,7 @@ def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), [(set GR32:$dst, (sextloadi32i16 addr:$src))], IIC_MOVSX>, OpSize32, TB, Sched<[WriteALULd]>; -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_R8>, TB, OpSize16, Sched<[WriteALU]>; @@ -73,7 +73,7 @@ let mayLoad = 1 in def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), "movz{bw|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX_R16_M8>, TB, OpSize16, Sched<[WriteALULd]>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), "movz{bl|x}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (zext GR8:$src))], IIC_MOVZX>, TB, @@ -94,16 +94,26 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), // These are the same as the regular MOVZX32rr8 and MOVZX32rm8 // except that they use GR32_NOREX for the output operand register class // instead of GR32. This allows them to operate on h registers on x86-64. 
-let neverHasSideEffects = 1, isCodeGenOnly = 1 in { +let hasSideEffects = 0, isCodeGenOnly = 1 in { def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg, (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), - "movz{bl|x}\t{$src, $dst|$dst, $src}", + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOVZX>, TB, Sched<[WriteALU]>; let mayLoad = 1 in def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem, (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), - "movz{bl|x}\t{$src, $dst|$dst, $src}", + "movz{bl|x}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOVZX>, TB, Sched<[WriteALULd]>; + +def MOVSX32_NOREXrr8 : I<0xBE, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX32_NOREXrm8 : I<0xBE, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src} # NOREX", + [], IIC_MOVSX>, TB, Sched<[WriteALULd]>; } // MOVSX64rr8 always has a REX prefix and it has an 8-bit register diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index c0a6864e2585..2993e42443d4 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -69,7 +69,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, /* IsRVariantCommutable */ 1, /* IsMVariantCommutable */ 1, Op>; -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; @@ -81,7 +81,7 @@ let neverHasSideEffects = 1 in { MemFrag128, MemFrag256, OpTy128, OpTy256, /* IsRVariantCommutable */ 1, /* IsMVariantCommutable */ 0>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 } // Fused Multiply-Add @@ -155,7 +155,7 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, SDNode OpNode, RegisterClass RC, ValueType OpVT, X86MemOperand x86memop, Operand memop, 
PatFrag mem_frag, ComplexPattern mem_cpat> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC, OpVT, mem_frag>; // See the other defm of r231 for the explanation regarding the diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 4ad7b7e32a0a..e1abf26664bb 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -17,13 +17,13 @@ // FPStack specific DAG Nodes. //===----------------------------------------------------------------------===// -def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, SDTCisVT<1, f80>]>; def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, - SDTCisPtrTy<1>, + SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, - SDTCisPtrTy<1>, + SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>]>; @@ -98,7 +98,7 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection. // All FP Stack operations are represented with four instructions here. The // first three instructions, generated by the instruction selector, use "RFP32" // "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, -// 64-bit or 80-bit floating point values. These sizes apply to the values, +// 64-bit or 80-bit floating point values. These sizes apply to the values, // not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be // copied to each other without losing information. These instructions are all // pseudo instructions and use the "_Fp" suffix. @@ -107,16 +107,13 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection. // The second instruction is defined with FPI, which is the actual instruction // emitted by the assembler. 
These use "RST" registers, although frequently // the actual register(s) used are implicit. These are always 80 bits. -// The FP stackifier pass converts one to the other after register allocation +// The FP stackifier pass converts one to the other after register allocation // occurs. // // Note that the FpI instruction should have instruction selection info (e.g. // a pattern) and the FPI instruction should have emission info (e.g. opcode // encoding and asm printing info). -// Pseudo Instruction for FP stack return values. -def FpPOP_RETVAL : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; - // FpIf32, FpIf64 - Floating Point Pseudo Instruction template. // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. @@ -142,66 +139,66 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // These instructions cannot address 80-bit memory. multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { // ST(0) = ST(0) + [mem] -def _Fp32m : FpIf32<(outs RFP32:$dst), +def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, + [(set RFP32:$dst, (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; -def _Fp64m : FpIf64<(outs RFP64:$dst), +def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, + [(set RFP64:$dst, (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; -def _Fp64m32: FpIf64<(outs RFP64:$dst), +def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, + [(set RFP64:$dst, (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; -def _Fp80m32: FpI_<(outs RFP80:$dst), +def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, + [(set RFP80:$dst, (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; -def _Fp80m64: FpI_<(outs RFP80:$dst), +def _Fp80m64: FpI_<(outs 
RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, + [(set RFP80:$dst, (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; -def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")> { + let mayLoad = 1; } -def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")> { + let mayLoad = 1; } // ST(0) = ST(0) + [memint] -def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, [(set RFP32:$dst, (OpNode RFP32:$src1, (X86fild addr:$src2, i16)))]>; -def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, [(set RFP32:$dst, (OpNode RFP32:$src1, (X86fild addr:$src2, i32)))]>; -def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, [(set RFP64:$dst, (OpNode RFP64:$src1, (X86fild addr:$src2, i16)))]>; -def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, [(set RFP64:$dst, (OpNode RFP64:$src1, (X86fild addr:$src2, i32)))]>; -def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), OneArgFPRW, [(set RFP80:$dst, (OpNode RFP80:$src1, (X86fild addr:$src2, i16)))]>; -def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), OneArgFPRW, [(set 
RFP80:$dst, (OpNode RFP80:$src1, (X86fild addr:$src2, i32)))]>; -def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")> { + let mayLoad = 1; } -def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")> { + let mayLoad = 1; } } @@ -285,7 +282,7 @@ defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">; defm SIN : FPUnary<fsin, MRM_FE, "fsin">; defm COS : FPUnary<fcos, MRM_FF, "fcos">; -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; @@ -418,7 +415,7 @@ def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, [(truncstoref64 RFP80:$src, addr:$op)]>; // FST does not support 80-bit memory target; FSTP must be used. 
-let mayStore = 1, neverHasSideEffects = 1 in { +let mayStore = 1, hasSideEffects = 0 in { def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; @@ -427,7 +424,7 @@ def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; } def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, [(store RFP80:$src, addr:$op)]>; -let mayStore = 1, neverHasSideEffects = 1 in { +let mayStore = 1, hasSideEffects = 0 in { def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; @@ -503,7 +500,7 @@ def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst", IIC_FST>; def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst", IIC_FST>; -def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst", IIC_FST>; } @@ -639,7 +636,7 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), "fxsave\t$dst", [], IIC_FXSAVE>, TB; def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB, + "fxsave{q|64}\t$dst", [], IIC_FXSAVE>, TB, Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), "fxrstor\t$src", [], IIC_FXRSTOR>, TB; @@ -659,12 +656,12 @@ def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>; // Required for CALL which return f32 / f64 / f80 values. 
def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>; -def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, +def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>; def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, +def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, RFP80:$src)>; -def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, +def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, RFP80:$src)>; def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op, RFP80:$src)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm/lib/Target/X86/X86InstrFormats.td index 8ef5f901c185..ba2387823f5e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFormats.td @@ -36,20 +36,21 @@ def MRM6m : Format<30>; def MRM7m : Format<31>; def MRM_C0 : Format<32>; def MRM_C1 : Format<33>; def MRM_C2 : Format<34>; def MRM_C3 : Format<35>; def MRM_C4 : Format<36>; def MRM_C8 : Format<37>; def MRM_C9 : Format<38>; def MRM_CA : Format<39>; def MRM_CB : Format<40>; -def MRM_D0 : Format<41>; def MRM_D1 : Format<42>; def MRM_D4 : Format<43>; -def MRM_D5 : Format<44>; def MRM_D6 : Format<45>; def MRM_D8 : Format<46>; -def MRM_D9 : Format<47>; def MRM_DA : Format<48>; def MRM_DB : Format<49>; -def MRM_DC : Format<50>; def MRM_DD : Format<51>; def MRM_DE : Format<52>; -def MRM_DF : Format<53>; def MRM_E0 : Format<54>; def MRM_E1 : Format<55>; -def MRM_E2 : Format<56>; def MRM_E3 : Format<57>; def MRM_E4 : Format<58>; -def MRM_E5 : Format<59>; def MRM_E8 : Format<60>; def MRM_E9 : Format<61>; -def MRM_EA : Format<62>; def MRM_EB : Format<63>; def MRM_EC : Format<64>; -def MRM_ED : Format<65>; def MRM_EE : Format<66>; def MRM_F0 : Format<67>; -def MRM_F1 : Format<68>; def MRM_F2 : Format<69>; def MRM_F3 : 
Format<70>; -def MRM_F4 : Format<71>; def MRM_F5 : Format<72>; def MRM_F6 : Format<73>; -def MRM_F7 : Format<74>; def MRM_F8 : Format<75>; def MRM_F9 : Format<76>; -def MRM_FA : Format<77>; def MRM_FB : Format<78>; def MRM_FC : Format<79>; -def MRM_FD : Format<80>; def MRM_FE : Format<81>; def MRM_FF : Format<82>; +def MRM_CF : Format<41>; def MRM_D0 : Format<42>; def MRM_D1 : Format<43>; +def MRM_D4 : Format<44>; def MRM_D5 : Format<45>; def MRM_D6 : Format<46>; +def MRM_D7 : Format<47>; def MRM_D8 : Format<48>; def MRM_D9 : Format<49>; +def MRM_DA : Format<50>; def MRM_DB : Format<51>; def MRM_DC : Format<52>; +def MRM_DD : Format<53>; def MRM_DE : Format<54>; def MRM_DF : Format<55>; +def MRM_E0 : Format<56>; def MRM_E1 : Format<57>; def MRM_E2 : Format<58>; +def MRM_E3 : Format<59>; def MRM_E4 : Format<60>; def MRM_E5 : Format<61>; +def MRM_E8 : Format<62>; def MRM_E9 : Format<63>; def MRM_EA : Format<64>; +def MRM_EB : Format<65>; def MRM_EC : Format<66>; def MRM_ED : Format<67>; +def MRM_EE : Format<68>; def MRM_F0 : Format<69>; def MRM_F1 : Format<70>; +def MRM_F2 : Format<71>; def MRM_F3 : Format<72>; def MRM_F4 : Format<73>; +def MRM_F5 : Format<74>; def MRM_F6 : Format<75>; def MRM_F7 : Format<76>; +def MRM_F8 : Format<77>; def MRM_F9 : Format<78>; def MRM_FA : Format<79>; +def MRM_FB : Format<80>; def MRM_FC : Format<81>; def MRM_FD : Format<82>; +def MRM_FE : Format<83>; def MRM_FF : Format<84>; // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our @@ -100,6 +101,7 @@ def CD8VF : CD8VForm<0>; // v := VL def CD8VH : CD8VForm<1>; // v := VL/2 def CD8VQ : CD8VForm<2>; // v := VL/4 def CD8VO : CD8VForm<3>; // v := VL/8 +// The tuple (subvector) forms. def CD8VT1 : CD8VForm<4>; // v := 1 def CD8VT2 : CD8VForm<5>; // v := 2 def CD8VT4 : CD8VForm<6>; // v := 4 @@ -144,11 +146,22 @@ def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. 
def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. +// Address size for encodings that change based on mode. +class AddressSize<bits<2> val> { + bits<2> Value = val; +} +def AdSizeX : AddressSize<0>; // Address size determined using addr operand. +def AdSize16 : AddressSize<1>; // Encodes a 16-bit address. +def AdSize32 : AddressSize<2>; // Encodes a 32-bit address. +def AdSize64 : AddressSize<3>; // Encodes a 64-bit address. + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize16 { OperandSize OpSize = OpSize16; } class OpSize32 { OperandSize OpSize = OpSize32; } -class AdSize { bit hasAdSizePrefix = 1; } +class AdSize16 { AddressSize AdSize = AdSize16; } +class AdSize32 { AddressSize AdSize = AdSize32; } +class AdSize64 { AddressSize AdSize = AdSize64; } class REX_W { bit hasREX_WPrefix = 1; } class LOCK { bit hasLockPrefix = 1; } class REP { bit hasREPPrefix = 1; } @@ -229,9 +242,11 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // AsmString from the parser, but still disassemble. OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change - // based on operand size of the mode + // based on operand size of the mode? bits<2> OpSizeBits = OpSize.Value; - bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix? + AddressSize AdSize = AdSizeX; // Does this instruction's encoding change + // based on address size of the mode? + bits<2> AdSizeBits = AdSize.Value; Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? bits<3> OpPrefixBits = OpPrefix.Value; @@ -282,35 +297,35 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); - // TSFlags layout should be kept in sync with X86InstrInfo.h. + // TSFlags layout should be kept in sync with X86BaseInfo.h. 
let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; - let TSFlags{9} = hasAdSizePrefix; - let TSFlags{12-10} = OpPrefixBits; - let TSFlags{15-13} = OpMapBits; - let TSFlags{16} = hasREX_WPrefix; - let TSFlags{20-17} = ImmT.Value; - let TSFlags{23-21} = FPForm.Value; - let TSFlags{24} = hasLockPrefix; - let TSFlags{25} = hasREPPrefix; - let TSFlags{27-26} = ExeDomain.Value; - let TSFlags{29-28} = OpEncBits; - let TSFlags{37-30} = Opcode; - let TSFlags{38} = hasVEX_WPrefix; - let TSFlags{39} = hasVEX_4V; - let TSFlags{40} = hasVEX_4VOp3; - let TSFlags{41} = hasVEX_i8ImmReg; - let TSFlags{42} = hasVEX_L; - let TSFlags{43} = ignoresVEX_L; - let TSFlags{44} = hasEVEX_K; - let TSFlags{45} = hasEVEX_Z; - let TSFlags{46} = hasEVEX_L2; - let TSFlags{47} = hasEVEX_B; + let TSFlags{10-9} = AdSizeBits; + let TSFlags{13-11} = OpPrefixBits; + let TSFlags{16-14} = OpMapBits; + let TSFlags{17} = hasREX_WPrefix; + let TSFlags{21-18} = ImmT.Value; + let TSFlags{24-22} = FPForm.Value; + let TSFlags{25} = hasLockPrefix; + let TSFlags{26} = hasREPPrefix; + let TSFlags{28-27} = ExeDomain.Value; + let TSFlags{30-29} = OpEncBits; + let TSFlags{38-31} = Opcode; + let TSFlags{39} = hasVEX_WPrefix; + let TSFlags{40} = hasVEX_4V; + let TSFlags{41} = hasVEX_4VOp3; + let TSFlags{42} = hasVEX_i8ImmReg; + let TSFlags{43} = hasVEX_L; + let TSFlags{44} = ignoresVEX_L; + let TSFlags{45} = hasEVEX_K; + let TSFlags{46} = hasEVEX_Z; + let TSFlags{47} = hasEVEX_L2; + let TSFlags{48} = hasEVEX_B; // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
- let TSFlags{54-48} = CD8_Scale; - let TSFlags{55} = has3DNow0F0FOpcode; - let TSFlags{56} = hasMemOp4Prefix; - let TSFlags{57} = hasEVEX_RC; + let TSFlags{55-49} = CD8_Scale; + let TSFlags{56} = has3DNow0F0FOpcode; + let TSFlags{57} = hasMemOp4Prefix; + let TSFlags{58} = hasEVEX_RC; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -325,26 +340,26 @@ class I<bits<8> o, Format f, dag outs, dag ins, string asm, let Pattern = pattern; let CodeSize = 3; } -class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary, Domain d = GenericDomain> : X86Inst<o, f, Imm8, outs, ins, asm, itin, d> { let Pattern = pattern; let CodeSize = 3; } -class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm8PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; @@ -357,14 +372,14 @@ class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm, let CodeSize = 3; } -class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16PCRel, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Ii32PCRel<bits<8> o, 
Format f, dag outs, dag ins, string asm, +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32PCRel, outs, ins, asm, itin> { let Pattern = pattern; @@ -391,14 +406,14 @@ class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern, // Iseg16 - 16-bit segment selector, 16-bit offset // Iseg32 - 16-bit segment selector, 32-bit offset -class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm16, outs, ins, asm, itin> { let Pattern = pattern; let CodeSize = 3; } -class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : X86Inst<o, f, Imm32, outs, ins, asm, itin> { let Pattern = pattern; @@ -476,7 +491,7 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, } // SSE1 Instruction Templates: -// +// // SSI - SSE1 instructions with XS prefix. // PSI - SSE1 instructions with PS prefix. // PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix. @@ -507,7 +522,7 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX]>; // SSE2 Instruction Templates: -// +// // SDI - SSE2 instructions with XD prefix. // SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix. // S2SI - SSE2 instructions with XS prefix. @@ -571,16 +586,16 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>; // SSE3 Instruction Templates: -// +// // S3I - SSE3 instructions with PD prefixes. // S3SI - SSE3 instructions with XS prefix. // S3DI - SSE3 instructions with XD prefix. 
-class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, +class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS, Requires<[UseSSE3]>; -class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, +class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD, Requires<[UseSSE3]>; @@ -591,7 +606,7 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, // SSSE3 Instruction Templates: -// +// // SS38I - SSSE3 instructions with T8 prefix. // SS3AI - SSSE3 instructions with TA prefix. // MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. @@ -619,7 +634,7 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: -// +// // SS48I - SSE 4.1 instructions with T8 prefix. // SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. // @@ -633,7 +648,7 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[UseSSE41]>; // SSE4.2 Instruction Templates: -// +// // SS428I - SSE 4.2 instructions with T8 prefix. 
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> @@ -697,6 +712,9 @@ class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, Requires<[HasAVX512]>; +class AVX5128IBase : T8PD { + Domain ExeDomain = SSEPackedInt; +} class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS, @@ -713,14 +731,25 @@ class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, Requires<[HasAVX512]>; +class AVX512BIBase : PD { + Domain ExeDomain = SSEPackedInt; +} class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, PD, Requires<[HasAVX512]>; +class AVX512BIi8Base : PD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, Requires<[HasAVX512]>; +class AVX512AIi8Base : TAPD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, @@ -743,6 +772,11 @@ class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, T8PD, EVEX_4V, Requires<[HasAVX512]>; +class AVX512FMA3Base : T8PD, EVEX_4V; + +class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, + 
list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, Requires<[HasAVX512]>; // AES Instruction Templates: // @@ -850,27 +884,27 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, // MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. // MMXID - MMX instructions with XD prefix. // MMXIS - MMX instructions with XS prefix. -class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; -class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,Not64BitMode]>; -class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX,In64BitMode]>; -class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PS, REX_W, Requires<[HasMMX]>; -class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, PD, Requires<[HasMMX]>; -class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, PS, Requires<[HasMMX]>; -class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, +class 
MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasMMX]>; -class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasMMX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 6f0fa9462770..7069bd68e00e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -83,7 +83,7 @@ def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; def X86insertps : SDNode<"X86ISD::INSERTPS", SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, - SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; + SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>; def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; @@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -197,12 +199,17 @@ def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; + SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, + 
SDTCisVec<0>, SDTCisInt<2>]>; +def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisVec<0>, SDTCisInt<3>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; +def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -231,10 +238,11 @@ def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; -def X86VPermilp : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>; -def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; -def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; -def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; +def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; +def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; +def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; @@ -247,6 +255,9 @@ def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; + +def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; + def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; @@ -254,6 +265,13 @@ def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>; def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>; +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; +def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; +def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; + +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; +def X86rcp28s : 
SDNode<"X86ISD::RCP28", STDFp2SrcRm>; + def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, SDTCisVT<4, i8>]>; @@ -265,6 +283,13 @@ def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; def X86pcmpestri : SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, + [SDTCisSameAs<0, 3>, + SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -311,6 +336,8 @@ def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; // 512-bit load pattern fragments def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; +def loadv64i8 : PatFrag<(ops node:$ptr), (v64i8 (load node:$ptr))>; +def loadv32i16 : PatFrag<(ops node:$ptr), (v32i16 (load node:$ptr))>; def loadv16i32 : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; @@ -397,20 +424,10 @@ def alignedloadv8i64 : PatFrag<(ops node:$ptr), // setting a feature bit in the processor (on startup, for example). // Opteron 10h and later implement such a feature. 
def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return Subtarget->hasVectorUAMem() + return Subtarget->hasSSEUnalignedMem() || cast<LoadSDNode>(N)->getAlignment() >= 16; }]>; -def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return Subtarget->hasVectorUAMem() - || cast<LoadSDNode>(N)->getAlignment() >= 4; -}]>; - -def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return Subtarget->hasVectorUAMem() - || cast<LoadSDNode>(N)->getAlignment() >= 8; -}]>; - def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>; def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>; @@ -427,10 +444,10 @@ def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>; def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>; // 512-bit memop pattern fragments -def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>; -def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop8 node:$ptr))>; -def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>; -def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop8 node:$ptr))>; +def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop node:$ptr))>; +def memopv8f64 : PatFrag<(ops node:$ptr), (v8f64 (memop node:$ptr))>; +def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop node:$ptr))>; +def memopv8i64 : PatFrag<(ops node:$ptr), (v8i64 (memop node:$ptr))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a // 16-byte boundary. @@ -509,7 +526,9 @@ def I8Imm : SDNodeXForm<imm, [{ }]>; def FROUND_NO_EXC : ImmLeaf<i32, [{ return Imm == 8; }]>; -def FROUND_CURRENT : ImmLeaf<i32, [{ return Imm == 4; }]>; +def FROUND_CURRENT : ImmLeaf<i32, [{ + return Imm == X86::STATIC_ROUNDING::CUR_DIRECTION; +}]>; // BYTE_imm - Transform bit immediates into byte immediates. 
def BYTE_imm : SDNodeXForm<imm, [{ diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 0d3afc43c2bb..6b6b8aedc9c6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" @@ -64,6 +65,7 @@ enum { TB_INDEX_1 = 1, TB_INDEX_2 = 2, TB_INDEX_3 = 3, + TB_INDEX_4 = 4, TB_INDEX_MASK = 0xf, // Do not insert the reverse map (MemOp -> RegOp) into the table. @@ -100,8 +102,8 @@ void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86GenInstrInfo( - (STI.is64Bit() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.is64Bit() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), Subtarget(STI), RI(STI) { static const X86OpTblEntry OpTbl2Addr[] = { @@ -144,14 +146,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::AND8rr, X86::AND8mr, 0 }, { X86::DEC16r, X86::DEC16m, 0 }, { X86::DEC32r, X86::DEC32m, 0 }, - { X86::DEC64_16r, X86::DEC64_16m, 0 }, - { X86::DEC64_32r, X86::DEC64_32m, 0 }, { X86::DEC64r, X86::DEC64m, 0 }, { X86::DEC8r, X86::DEC8m, 0 }, { X86::INC16r, X86::INC16m, 0 }, { X86::INC32r, X86::INC32m, 0 }, - { X86::INC64_16r, X86::INC64_16m, 0 }, - { X86::INC64_32r, X86::INC64_32m, 0 }, { X86::INC64r, X86::INC64m, 0 }, { X86::INC8r, X86::INC8m, 0 }, { X86::NEG16r, X86::NEG16m, 0 }, @@ -377,7 +375,39 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions - { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr, TB_FOLDED_STORE } + { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, + { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, + { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, 
TB_FOLDED_STORE | TB_ALIGN_32 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE } }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { @@ -415,6 +445,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, + { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, + { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, + { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, @@ -493,6 +527,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, 
X86::VCVTSS2SIrm, 0 }, + { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, + { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, + { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, + { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, + { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, + { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, @@ -526,6 +566,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, // AVX 256-bit foldable instructions + { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, + { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, + { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, + { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, + { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, + { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, @@ -533,6 +579,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, + { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, + { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, + { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, + { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, + { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, + { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, + { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, // AVX2 foldable instructions { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, @@ -541,13 +594,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, - { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, - { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, - { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, - { X86::VSQRTPDYr, 
X86::VSQRTPDYm, 0 }, - { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, @@ -601,18 +647,51 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVDQA32rr, X86::VMOVDQA32rm, TB_ALIGN_64 }, - { X86::VMOVDQA64rr, X86::VMOVDQA64rm, TB_ALIGN_64 }, - { X86::VMOVDQU32rr, X86::VMOVDQU32rm, 0 }, - { X86::VMOVDQU64rr, X86::VMOVDQU64rm, 0 }, + { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, + { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { 
X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + // AVX-512 foldable instructions (256-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }, + { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 } }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -869,8 +948,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, - { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, @@ -1249,6 +1326,19 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VALIGNQrri, X86::VALIGNQrmi, 0 }, { X86::VALIGNDrri, X86::VALIGNDrmi, 0 }, { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, + { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable 
instructions + { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, + + // AVX-512{F,VL} foldable instructions + { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, + { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, + { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, + { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, @@ -1429,7 +1519,51 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, - { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 } + { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }, + { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, + // AVX-512 arithmetic instructions + { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, + { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, + { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, + { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, + { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, + { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, + { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, + { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, + { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, + { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, + { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, + { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 256-bit + { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, + { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, + { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, + { X86::VSUBPDZ256rrkz, 
X86::VSUBPDZ256rmkz, 0 }, + { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, + { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, + { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, + { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, + { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, + { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, + { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, + { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, + // AVX-512{F,VL} arithmetic instructions 128-bit + { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, + { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, + { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, + { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, + { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, + { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, + { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, + { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, + { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, + { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, + { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, + { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { @@ -1442,6 +1576,57 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } + static const X86OpTblEntry OpTbl4[] = { + // AVX-512 foldable instructions + { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, + { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, + { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, + { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, + { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, + { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, + { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, + { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, + { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, + { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, + { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, + { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, + // AVX-512{F,VL} foldable instructions 256-bit + { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, + { X86::VADDPDZ256rrk, 
X86::VADDPDZ256rmk, 0 }, + { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, + { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, + { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, + { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, + { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, + { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, + { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, + { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, + { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, + { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, + // AVX-512{F,VL} foldable instructions 128-bit + { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, + { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, + { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, + { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, + { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, + { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, + { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, + { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, + { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, + { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, + { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, + { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 } + }; + + for (unsigned i = 0, e = array_lengthof(OpTbl4); i != e; ++i) { + unsigned RegOp = OpTbl4[i].RegOp; + unsigned MemOp = OpTbl4[i].MemOp; + unsigned Flags = OpTbl4[i].Flags; + AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, + RegOp, MemOp, + // Index 4, folded load + Flags | TB_INDEX_4 | TB_FOLDED_LOAD); + } } void @@ -1507,6 +1692,58 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, return false; } +int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + + if (MI->getOpcode() == getCallFrameSetupOpcode() || + MI->getOpcode() == getCallFrameDestroyOpcode()) { + unsigned StackAlign = TFI->getStackAlignment(); + int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / 
StackAlign * + StackAlign; + + SPAdj -= MI->getOperand(1).getImm(); + + if (MI->getOpcode() == getCallFrameSetupOpcode()) + return SPAdj; + else + return -SPAdj; + } + + // To know whether a call adjusts the stack, we need information + // that is bound to the following ADJCALLSTACKUP pseudo. + // Look for the next ADJCALLSTACKUP that follows the call. + if (MI->isCall()) { + const MachineBasicBlock* MBB = MI->getParent(); + auto I = ++MachineBasicBlock::const_iterator(MI); + for (auto E = MBB->end(); I != E; ++I) { + if (I->getOpcode() == getCallFrameDestroyOpcode() || + I->isCall()) + break; + } + + // If we could not find a frame destroy opcode, then it has already + // been simplified, so we don't care. + if (I->getOpcode() != getCallFrameDestroyOpcode()) + return 0; + + return -(I->getOperand(1).getImm()); + } + + // Currently handle only PUSHes we can reasonably expect to see + // in call sequences + switch (MI->getOpcode()) { + default: + return 0; + case X86::PUSH32i8: + case X86::PUSH32r: + case X86::PUSH32rmm: + case X86::PUSH32rmr: + case X86::PUSHi32: + return 4; + } +} + /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. 
bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, @@ -1543,8 +1780,11 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVAPSrm: case X86::VMOVAPDrm: case X86::VMOVDQArm: + case X86::VMOVUPSYrm: case X86::VMOVAPSYrm: + case X86::VMOVUPDYrm: case X86::VMOVAPDYrm: + case X86::VMOVDQUYrm: case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: @@ -1572,8 +1812,11 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::VMOVAPSmr: case X86::VMOVAPDmr: case X86::VMOVDQAmr: + case X86::VMOVUPSYmr: case X86::VMOVAPSYmr: + case X86::VMOVUPDYmr: case X86::VMOVAPDYmr: + case X86::VMOVDQUYmr: case X86::VMOVDQAYmr: case X86::VMOVUPSZmr: case X86::VMOVAPSZmr: @@ -1980,11 +2223,9 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, break; } case X86::INC16r: - case X86::INC64_16r: addRegOffset(MIB, leaInReg, true, 1); break; case X86::DEC16r: - case X86::DEC64_16r: addRegOffset(MIB, leaInReg, true, -1); break; case X86::ADD16ri: @@ -2078,34 +2319,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, unsigned MIOpc = MI->getOpcode(); switch (MIOpc) { - case X86::SHUFPSrri: { - assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!"); - if (!Subtarget.hasSSE2()) return nullptr; - - unsigned B = MI->getOperand(1).getReg(); - unsigned C = MI->getOperand(2).getReg(); - if (B != C) return nullptr; - unsigned M = MI->getOperand(3).getImm(); - NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addOperand(Dest).addOperand(Src).addImm(M); - break; - } - case X86::SHUFPDrri: { - assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!"); - if (!Subtarget.hasSSE2()) return nullptr; - - unsigned B = MI->getOperand(1).getReg(); - unsigned C = MI->getOperand(2).getReg(); - if (B != C) return nullptr; - unsigned M = MI->getOperand(3).getImm(); - - // Convert to PSHUFD mask. 
- M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44; - - NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri)) - .addOperand(Dest).addOperand(Src).addImm(M); - break; - } + default: return nullptr; case X86::SHL64ri: { assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); @@ -2160,185 +2374,175 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); break; } - default: { + case X86::INC64r: + case X86::INC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - switch (MIOpc) { - default: return nullptr; - case X86::INC64r: - case X86::INC32r: - case X86::INC64_32r: { - assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r - : (is64Bit ? 
X86::LEA64_32r : X86::LEA32r); - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + NewMI = addOffset(MIB, 1); + break; + } + case X86::INC16r: + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), 1); + break; + case X86::DEC64r: + case X86::DEC32r: { + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r + : (is64Bit ? X86::LEA64_32r : X86::LEA32r); - NewMI = addOffset(MIB, 1); - break; - } - case X86::INC16r: - case X86::INC64_16r: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), 1); - break; - case X86::DEC64r: - case X86::DEC32r: - case X86::DEC64_32r: { - assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r - : (is64Bit ? 
X86::LEA64_32r : X86::LEA32r); - - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); - NewMI = addOffset(MIB, -1); + NewMI = addOffset(MIB, -1); - break; - } - case X86::DEC16r: - case X86::DEC64_16r: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), -1); - break; - case X86::ADD64rr: - case X86::ADD64rr_DB: - case X86::ADD32rr: - case X86::ADD32rr_DB: { - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Opc; - if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) - Opc = X86::LEA64r; - else - Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; + break; + } + case X86::DEC16r: + if (DisableLEA16) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), -1); + break; + case X86::ADD64rr: + case X86::ADD64rr_DB: + case X86::ADD32rr: + case X86::ADD32rr_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc; + if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) + Opc = X86::LEA64r; + else + Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - const MachineOperand &Src2 = MI->getOperand(2); - bool isKill2, isUndef2; - unsigned SrcReg2; - MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, - SrcReg2, isKill2, isUndef2, ImplicitOp2)) - return nullptr; + const MachineOperand &Src2 = MI->getOperand(2); + bool isKill2, isUndef2; + unsigned SrcReg2; + MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false, + SrcReg2, isKill2, isUndef2, ImplicitOp2)) + return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); - if (ImplicitOp2.getReg() != 0) - MIB.addOperand(ImplicitOp2); + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + if (ImplicitOp2.getReg() != 0) + 
MIB.addOperand(ImplicitOp2); - NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); + NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); - // Preserve undefness of the operands. - NewMI->getOperand(1).setIsUndef(isUndef); - NewMI->getOperand(3).setIsUndef(isUndef2); + // Preserve undefness of the operands. + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); - if (LV && Src2.isKill()) - LV->replaceKillInstruction(SrcReg2, MI, NewMI); - break; - } - case X86::ADD16rr: - case X86::ADD16rr_DB: { - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Src2 = MI->getOperand(2).getReg(); - bool isKill2 = MI->getOperand(2).isKill(); - NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); - - // Preserve undefness of the operands. - bool isUndef = MI->getOperand(1).isUndef(); - bool isUndef2 = MI->getOperand(2).isUndef(); - NewMI->getOperand(1).setIsUndef(isUndef); - NewMI->getOperand(3).setIsUndef(isUndef2); - - if (LV && isKill2) - LV->replaceKillInstruction(Src2, MI, NewMI); - break; - } - case X86::ADD64ri32: - case X86::ADD64ri8: - case X86::ADD64ri32_DB: - case X86::ADD64ri8_DB: - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); - break; - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD32ri_DB: - case X86::ADD32ri8_DB: { - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; - - bool isKill, isUndef; - unsigned SrcReg; - MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, - SrcReg, isKill, isUndef, ImplicitOp)) - return nullptr; + if (LV && Src2.isKill()) + LV->replaceKillInstruction(SrcReg2, MI, NewMI); + break; + } + case X86::ADD16rr: + case X86::ADD16rr_DB: { + if (DisableLEA16) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI->getOperand(2).getReg(); + bool isKill2 = MI->getOperand(2).isKill(); + NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); + + // Preserve undefness of the operands. + bool isUndef = MI->getOperand(1).isUndef(); + bool isUndef2 = MI->getOperand(2).isUndef(); + NewMI->getOperand(1).setIsUndef(isUndef); + NewMI->getOperand(3).setIsUndef(isUndef2); + + if (LV && isKill2) + LV->replaceKillInstruction(Src2, MI, NewMI); + break; + } + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64ri32_DB: + case X86::ADD64ri8_DB: + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32ri_DB: + case X86::ADD32ri8_DB: { + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Opc = is64Bit ? 
X86::LEA64_32r : X86::LEA32r; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); - if (ImplicitOp.getReg() != 0) - MIB.addOperand(ImplicitOp); + bool isKill, isUndef; + unsigned SrcReg; + MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true, + SrcReg, isKill, isUndef, ImplicitOp)) + return nullptr; - NewMI = addOffset(MIB, MI->getOperand(2).getImm()); - break; - } - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri_DB: - case X86::ADD16ri8_DB: - if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) - : nullptr; - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); - break; - } + MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + if (ImplicitOp.getReg() != 0) + MIB.addOperand(ImplicitOp); + + NewMI = addOffset(MIB, MI->getOperand(2).getImm()); + break; } + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16ri_DB: + case X86::ADD16ri8_DB: + if (DisableLEA16) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + : nullptr; + assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest).addOperand(Src), + MI->getOperand(2).getImm()); + break; } if (!NewMI) return nullptr; @@ -2387,6 +2591,42 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI->getOperand(3).setImm(Size-Amt); return TargetInstrInfo::commuteInstruction(MI, NewMI); } + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDWrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWYrri:{ + unsigned Mask; + switch (MI->getOpcode()) { + default: llvm_unreachable("Unreachable!"); + case X86::BLENDPDrri: Mask = 0x03; break; + case X86::BLENDPSrri: Mask = 0x0F; break; + case X86::PBLENDWrri: Mask = 0xFF; break; + case X86::VBLENDPDrri: Mask = 0x03; break; + case X86::VBLENDPSrri: Mask = 0x0F; break; + case X86::VBLENDPDYrri: Mask = 0x0F; break; + case X86::VBLENDPSYrri: Mask = 0xFF; break; + case X86::VPBLENDDrri: Mask = 0x0F; break; + case X86::VPBLENDWrri: Mask = 0xFF; break; + case X86::VPBLENDDYrri: Mask = 0xFF; break; + case X86::VPBLENDWYrri: Mask = 0xFF; break; + } + // Only the least significant bits of Imm are used. 
+ unsigned Imm = MI->getOperand(3).getImm() & Mask; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Mask ^ Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: @@ -2471,6 +2711,20 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { + case X86::BLENDPDrri: + case X86::BLENDPSrri: + case X86::PBLENDWrri: + case X86::VBLENDPDrri: + case X86::VBLENDPSrri: + case X86::VBLENDPDYrri: + case X86::VBLENDPSYrri: + case X86::VPBLENDDrri: + case X86::VPBLENDDYrri: + case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: + SrcOpIdx1 = 1; + SrcOpIdx2 = 2; + return true; case X86::VFMADDPDr231r: case X86::VFMADDPSr231r: case X86::VFMADDSDr231r: @@ -2506,22 +2760,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { switch (BrOpc) { default: return X86::COND_INVALID; - case X86::JE_4: return X86::COND_E; - case X86::JNE_4: return X86::COND_NE; - case X86::JL_4: return X86::COND_L; - case X86::JLE_4: return X86::COND_LE; - case X86::JG_4: return X86::COND_G; - case X86::JGE_4: return X86::COND_GE; - case X86::JB_4: return X86::COND_B; - case X86::JBE_4: return X86::COND_BE; - case X86::JA_4: return X86::COND_A; - case X86::JAE_4: return X86::COND_AE; - case X86::JS_4: return X86::COND_S; - case X86::JNS_4: return X86::COND_NS; - case X86::JP_4: return X86::COND_P; - case X86::JNP_4: return X86::COND_NP; - case X86::JO_4: return X86::COND_O; - case X86::JNO_4: return X86::COND_NO; + case X86::JE_1: return X86::COND_E; + case X86::JNE_1: 
return X86::COND_NE; + case X86::JL_1: return X86::COND_L; + case X86::JLE_1: return X86::COND_LE; + case X86::JG_1: return X86::COND_G; + case X86::JGE_1: return X86::COND_GE; + case X86::JB_1: return X86::COND_B; + case X86::JBE_1: return X86::COND_BE; + case X86::JA_1: return X86::COND_A; + case X86::JAE_1: return X86::COND_AE; + case X86::JS_1: return X86::COND_S; + case X86::JNS_1: return X86::COND_NS; + case X86::JP_1: return X86::COND_P; + case X86::JNP_1: return X86::COND_NP; + case X86::JO_1: return X86::COND_O; + case X86::JNO_1: return X86::COND_NO; } } @@ -2606,22 +2860,22 @@ X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) { unsigned X86::GetCondBranchFromCond(X86::CondCode CC) { switch (CC) { default: llvm_unreachable("Illegal condition code!"); - case X86::COND_E: return X86::JE_4; - case X86::COND_NE: return X86::JNE_4; - case X86::COND_L: return X86::JL_4; - case X86::COND_LE: return X86::JLE_4; - case X86::COND_G: return X86::JG_4; - case X86::COND_GE: return X86::JGE_4; - case X86::COND_B: return X86::JB_4; - case X86::COND_BE: return X86::JBE_4; - case X86::COND_A: return X86::JA_4; - case X86::COND_AE: return X86::JAE_4; - case X86::COND_S: return X86::JS_4; - case X86::COND_NS: return X86::JNS_4; - case X86::COND_P: return X86::JP_4; - case X86::COND_NP: return X86::JNP_4; - case X86::COND_O: return X86::JO_4; - case X86::COND_NO: return X86::JNO_4; + case X86::COND_E: return X86::JE_1; + case X86::COND_NE: return X86::JNE_1; + case X86::COND_L: return X86::JL_1; + case X86::COND_LE: return X86::JLE_1; + case X86::COND_G: return X86::JG_1; + case X86::COND_GE: return X86::JGE_1; + case X86::COND_B: return X86::JB_1; + case X86::COND_BE: return X86::JBE_1; + case X86::COND_A: return X86::JA_1; + case X86::COND_AE: return X86::JAE_1; + case X86::COND_S: return X86::JS_1; + case X86::COND_NS: return X86::JNS_1; + case X86::COND_P: return X86::JP_1; + case X86::COND_NP: return X86::JNP_1; + case X86::COND_O: return X86::JO_1; + case 
X86::COND_NO: return X86::JNO_1; } } @@ -2779,7 +3033,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; // Handle unconditional branches. - if (I->getOpcode() == X86::JMP_4) { + if (I->getOpcode() == X86::JMP_1) { UnCondBrIter = I; if (!AllowModify) { @@ -2841,7 +3095,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) .addMBB(UnCondBrIter->getOperand(0).getMBB()); - BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_4)) + BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) .addMBB(TargetBB); OldInst->eraseFromParent(); @@ -2906,7 +3160,7 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { --I; if (I->isDebugValue()) continue; - if (I->getOpcode() != X86::JMP_4 && + if (I->getOpcode() != X86::JMP_1 && getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. @@ -2931,7 +3185,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, if (Cond.empty()) { // Unconditional branch? assert(!FBB && "Unconditional branch with multiple successors!"); - BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); return 1; } @@ -2941,16 +3195,16 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, switch (CC) { case X86::COND_NP_OR_E: // Synthesize NP_OR_E with two branches. - BuildMI(&MBB, DL, get(X86::JNP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); ++Count; - BuildMI(&MBB, DL, get(X86::JE_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); ++Count; break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. 
- BuildMI(&MBB, DL, get(X86::JNE_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); ++Count; - BuildMI(&MBB, DL, get(X86::JP_4)).addMBB(TBB); + BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; default: { @@ -2961,7 +3215,7 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, } if (FBB) { // Two-way Conditional branch. Insert the second branch. - BuildMI(&MBB, DL, get(X86::JMP_4)).addMBB(FBB); + BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; } return Count; @@ -3067,6 +3321,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, inline static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || + X86::VK32RegClass.contains(Reg) || + X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } static @@ -3143,7 +3399,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Moving EFLAGS to / from another register requires a push and a pop. // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - colobbersTheStack. + // first frame index. See X86FrameLowering.cpp - clobbersTheStack. 
if (SrcReg == X86::EFLAGS) { if (X86::GR64RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF64)); @@ -3287,9 +3543,11 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = - (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = (MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) @@ -3324,9 +3582,11 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16); - bool isAligned = - (MF.getTarget().getFrameLowering()->getStackAlignment() >= Alignment) || - RI.canRealignStack(MF); + bool isAligned = (MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment() >= Alignment) || + RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); @@ -3495,14 +3755,12 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: - case X86::DEC64_32r: case X86::DEC64_16r: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: case X86::ADD16rr: case X86::ADD8rr: case 
X86::ADD64rm: case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: - case X86::INC64_32r: case X86::INC64_16r: case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: @@ -3868,10 +4126,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. -MachineInstr* X86InstrInfo:: -optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, - unsigned &FoldAsLoadDefReg, - MachineInstr *&DefMI) const { +MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &FoldAsLoadDefReg, + MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) return nullptr; // To be conservative, if there exists another load, clear the load candidate. @@ -3887,55 +4145,35 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, if (!DefMI->isSafeToMove(this, nullptr, SawStore)) return nullptr; - // We try to commute MI if possible. - unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1; - for (unsigned Idx = 0; Idx < IdxEnd; Idx++) { - // Collect information about virtual register operands of MI. - unsigned SrcOperandId = 0; - bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg != FoldAsLoadDefReg) - continue; - // Do not fold if we have a subreg use or a def or multiple uses. 
- if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) - return nullptr; - - SrcOperandId = i; - FoundSrcOperand = true; - } - if (!FoundSrcOperand) return nullptr; - - // Check whether we can fold the def into SrcOperandId. - SmallVector<unsigned, 8> Ops; - Ops.push_back(SrcOperandId); - MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); - if (FoldMI) { - FoldAsLoadDefReg = 0; - return FoldMI; - } - - if (Idx == 1) { - // MI was changed but it didn't help, commute it back! - commuteInstruction(MI, false); + // Collect information about virtual register operands of MI. + unsigned SrcOperandId = 0; + bool FoundSrcOperand = false; + for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg != FoldAsLoadDefReg) + continue; + // Do not fold if we have a subreg use or a def or multiple uses. + if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) return nullptr; - } - // Check whether we can commute MI and enable folding. - if (MI->isCommutable()) { - MachineInstr *NewMI = commuteInstruction(MI, false); - // Unable to commute. - if (!NewMI) return nullptr; - if (NewMI != MI) { - // New instruction. It doesn't need to be kept. - NewMI->eraseFromParent(); - return nullptr; - } - } + SrcOperandId = i; + FoundSrcOperand = true; } + if (!FoundSrcOperand) + return nullptr; + + // Check whether we can fold the def into SrcOperandId. + SmallVector<unsigned, 8> Ops; + Ops.push_back(SrcOperandId); + MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); + if (FoldMI) { + FoldAsLoadDefReg = 0; + return FoldMI; + } + return nullptr; } @@ -3961,6 +4199,28 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +// LoadStackGuard has so far only been implemented for 64-bit MachO. Different +// code sequence is needed for other targets. 
+static void expandLoadStackGuard(MachineInstrBuilder &MIB, + const TargetInstrInfo &TII) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + const GlobalValue *GV = + cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); + unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + MachineMemOperand *MMO = MBB.getParent()-> + getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineBasicBlock::iterator I = MIB.getInstr(); + + BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) + .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0) + .addMemOperand(MMO); + MIB->setDebugLoc(DL); + MIB->setDesc(TII.get(X86::MOV64rm)); + MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); +} + bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); @@ -3991,10 +4251,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; - case X86::KSET0B: + case X86::KSET0B: case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); case X86::KSET1B: case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case TargetOpcode::LOAD_STACK_GUARD: + expandLoadStackGuard(MIB, *this); + return true; } return false; } @@ -4070,7 +4333,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl<MachineOperand> &MOs, - unsigned Size, unsigned Align) const { + unsigned Size, unsigned Align, + bool AllowCommute) const { const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); @@ -4117,6 +4381,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, 
OpcodeTablePtr = &RegOp2MemOpTable2; } else if (i == 3) { OpcodeTablePtr = &RegOp2MemOpTable3; + } else if (i == 4) { + OpcodeTablePtr = &RegOp2MemOpTable4; } // If table selected... @@ -4138,8 +4404,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do - // a 32-bit load which is implicitly zero-extended. This likely is due - // to liveintervalanalysis remat'ing a load from stack slot. + // a 32-bit load which is implicitly zero-extended. This likely is + // due to live interval analysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; @@ -4158,8 +4424,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // to a 32-bit one. unsigned DstReg = NewMI->getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) - NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, - X86::sub_32bit)); + NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); } @@ -4167,6 +4432,65 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, } } + // If the instruction and target operand are commutable, commute the + // instruction and try again. + if (AllowCommute) { + unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2; + if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { + bool HasDef = MI->getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? 
MI->getOperand(0).getReg() : 0; + unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); + bool Tied0 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + + // If either of the commutable operands are tied to the destination + // then we can not commute + fold. + if ((HasDef && Reg0 == Reg1 && Tied0) || + (HasDef && Reg0 == Reg2 && Tied1)) + return nullptr; + + if ((CommuteOpIdx1 == OriginalOpIdx) || + (CommuteOpIdx2 == OriginalOpIdx)) { + MachineInstr *CommutedMI = commuteInstruction(MI, false); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } + + // Attempt to fold with the commuted version of the instruction. + unsigned CommuteOp = + (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align, + /*AllowCommute=*/false); + if (NewMI) + return NewMI; + + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = commuteInstruction(MI, false); + if (!UncommutedMI) { + // Unable to commute. + return nullptr; + } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. 
+ return nullptr; + } + } + } + // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; @@ -4192,23 +4516,43 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, static bool hasPartialRegUpdate(unsigned Opcode) { switch (Opcode) { case X86::CVTSI2SSrr: + case X86::CVTSI2SSrm: case X86::CVTSI2SS64rr: + case X86::CVTSI2SS64rm: case X86::CVTSI2SDrr: + case X86::CVTSI2SDrm: case X86::CVTSI2SD64rr: + case X86::CVTSI2SD64rm: case X86::CVTSD2SSrr: + case X86::CVTSD2SSrm: case X86::Int_CVTSD2SSrr: + case X86::Int_CVTSD2SSrm: case X86::CVTSS2SDrr: + case X86::CVTSS2SDrm: case X86::Int_CVTSS2SDrr: + case X86::Int_CVTSS2SDrm: case X86::RCPSSr: + case X86::RCPSSm: case X86::RCPSSr_Int: + case X86::RCPSSm_Int: case X86::ROUNDSDr: + case X86::ROUNDSDm: case X86::ROUNDSDr_Int: case X86::ROUNDSSr: + case X86::ROUNDSSm: case X86::ROUNDSSr_Int: case X86::RSQRTSSr: + case X86::RSQRTSSm: case X86::RSQRTSSr_Int: + case X86::RSQRTSSm_Int: case X86::SQRTSSr: + case X86::SQRTSSm: case X86::SQRTSSr_Int: + case X86::SQRTSSm_Int: + case X86::SQRTSDr: + case X86::SQRTSDm: + case X86::SQRTSDr_Int: + case X86::SQRTSDm_Int: return true; } @@ -4245,28 +4589,52 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, static bool hasUndefRegUpdate(unsigned Opcode) { switch (Opcode) { case X86::VCVTSI2SSrr: + case X86::VCVTSI2SSrm: case X86::Int_VCVTSI2SSrr: + case X86::Int_VCVTSI2SSrm: case X86::VCVTSI2SS64rr: + case X86::VCVTSI2SS64rm: case X86::Int_VCVTSI2SS64rr: + case X86::Int_VCVTSI2SS64rm: case X86::VCVTSI2SDrr: + case X86::VCVTSI2SDrm: case X86::Int_VCVTSI2SDrr: + case X86::Int_VCVTSI2SDrm: case X86::VCVTSI2SD64rr: + case X86::VCVTSI2SD64rm: case X86::Int_VCVTSI2SD64rr: + case X86::Int_VCVTSI2SD64rm: case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: case X86::Int_VCVTSD2SSrr: + case X86::Int_VCVTSD2SSrm: case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: case X86::Int_VCVTSS2SDrr: + case X86::Int_VCVTSS2SDrm: case 
X86::VRCPSSr: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: case X86::VROUNDSDr: + case X86::VROUNDSDm: case X86::VROUNDSDr_Int: case X86::VROUNDSSr: + case X86::VROUNDSSm: case X86::VROUNDSSr_Int: case X86::VRSQRTSSr: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: case X86::VSQRTSSr: - - // AVX-512 + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrm: case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrm: return true; } @@ -4350,8 +4718,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) - Alignment = std::min( - Alignment, MF.getTarget().getFrameLowering()->getStackAlignment()); + Alignment = std::min(Alignment, MF.getTarget() + .getSubtargetImpl() + ->getFrameLowering() + ->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -4374,7 +4744,27 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, SmallVector<MachineOperand,4> MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, + Size, Alignment, /*AllowCommute=*/true); +} + +static bool isPartialRegisterLoad(const MachineInstr &LoadMI, + const MachineFunction &MF) { + unsigned Opc = LoadMI.getOpcode(); + unsigned RegSize = + MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); + + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) + // These instructions only load 32 bits, we can't fold them if the + // destination register is wider than 32 bits (4 bytes). 
+ return true; + + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) + // These instructions only load 64 bits, we can't fold them if the + // destination register is wider than 64 bits (8 bytes). + return true; + + return false; } MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, @@ -4384,8 +4774,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI->getDesc().getNumOperands(); int FrameIndex; - if (isLoadFromStackSlot(LoadMI, FrameIndex)) + if (isLoadFromStackSlot(LoadMI, FrameIndex)) { + if (isPartialRegisterLoad(*LoadMI, MF)) + return nullptr; return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); + } // Check switch flag if (NoFusing) return nullptr; @@ -4496,19 +4889,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, break; } default: { - if ((LoadMI->getOpcode() == X86::MOVSSrm || - LoadMI->getOpcode() == X86::VMOVSSrm) && - MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() - > 4) - // These instructions only load 32 bits, we can't fold them if the - // destination register is wider than 32 bits (4 bytes). - return nullptr; - if ((LoadMI->getOpcode() == X86::MOVSDrm || - LoadMI->getOpcode() == X86::VMOVSDrm) && - MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize() - > 8) - // These instructions only load 64 bits, we can't fold them if the - // destination register is wider than 64 bits (8 bytes). + if (isPartialRegisterLoad(*LoadMI, MF)) return nullptr; // Folding a normal load. Just copy the load's address operands. 
@@ -4517,7 +4898,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, break; } } - return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment); + return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, + /*Size=*/0, Alignment, /*AllowCommute=*/true); } @@ -4997,26 +5379,26 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, switch(Second->getOpcode()) { default: return false; - case X86::JE_4: - case X86::JNE_4: - case X86::JL_4: - case X86::JLE_4: - case X86::JG_4: - case X86::JGE_4: + case X86::JE_1: + case X86::JNE_1: + case X86::JL_1: + case X86::JLE_1: + case X86::JG_1: + case X86::JGE_1: FuseKind = FuseInc; break; - case X86::JB_4: - case X86::JBE_4: - case X86::JA_4: - case X86::JAE_4: + case X86::JB_1: + case X86::JBE_1: + case X86::JA_1: + case X86::JAE_1: FuseKind = FuseCmp; break; - case X86::JS_4: - case X86::JNS_4: - case X86::JP_4: - case X86::JNP_4: - case X86::JO_4: - case X86::JNO_4: + case X86::JS_1: + case X86::JNS_1: + case X86::JP_1: + case X86::JNP_1: + case X86::JO_1: + case X86::JNO_1: FuseKind = FuseTest; break; } @@ -5129,14 +5511,10 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, return FuseKind == FuseCmp || FuseKind == FuseInc; case X86::INC16r: case X86::INC32r: - case X86::INC64_16r: - case X86::INC64_32r: case X86::INC64r: case X86::INC8r: case X86::DEC16r: case X86::DEC32r: - case X86::DEC64_16r: - case X86::DEC64_32r: case X86::DEC64r: case X86::DEC8r: return FuseKind == FuseInc; @@ -5299,16 +5677,32 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } +// This code must remain in sync with getJumpInstrTableEntryBound in this class! +// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. 
void X86InstrInfo::getUnconditionalBranch( MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { - Branch.setOpcode(X86::JMP_4); + Branch.setOpcode(X86::JMP_1); Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); } +// This code must remain in sync with getJumpInstrTableEntryBound in this class! +// In particular, getJumpInstrTableEntryBound must always return an upper bound +// on the encoding lengths of the instructions generated by +// getUnconditionalBranch and getTrap. void X86InstrInfo::getTrap(MCInst &MI) const { MI.setOpcode(X86::TRAP); } +// See getTrap and getUnconditionalBranch for conditions on the value returned +// by this function. +unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { + // 5 bytes suffice: JMP_4 Symbol@PLT is uses 1 byte (E9) for the JMP_4 and 4 + // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). + return 5; +} + bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; @@ -5351,10 +5745,10 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSSr: - case X86::VSQRTPDZrm: - case X86::VSQRTPDZrr: - case X86::VSQRTPSZrm: - case X86::VSQRTPSZrr: + case X86::VSQRTPDZm: + case X86::VSQRTPDZr: + case X86::VSQRTPSZm: + case X86::VSQRTPSZr: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: case X86::VSQRTSDZr: @@ -5426,7 +5820,7 @@ namespace { MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); MachineRegisterInfo &RegInfo = MF.getRegInfo(); - const X86InstrInfo *TII = TM->getInstrInfo(); + const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); unsigned PC; if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) @@ -5524,7 +5918,7 @@ namespace { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF->getTarget()); const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getInstrInfo(); + const 
X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), @@ -5545,7 +5939,7 @@ namespace { const X86TargetMachine *TM = static_cast<const X86TargetMachine *>(&MF->getTarget()); const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit(); - const X86InstrInfo *TII = TM->getInstrInfo(); + const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Create a virtual register for the TLS base address. MachineRegisterInfo &RegInfo = MF->getRegInfo(); diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index c177e3a5c7c7..4d15467f0ca3 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86INSTRUCTIONINFO_H -#define X86INSTRUCTIONINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H +#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H #include "MCTargetDesc/X86BaseInfo.h" #include "X86RegisterInfo.h" @@ -152,6 +152,7 @@ class X86InstrInfo final : public X86GenInstrInfo { RegOp2MemOpTableType RegOp2MemOpTable1; RegOp2MemOpTableType RegOp2MemOpTable2; RegOp2MemOpTableType RegOp2MemOpTable3; + RegOp2MemOpTableType RegOp2MemOpTable4; /// MemOp2RegOpTable - Load / store unfolding opcode map. /// @@ -174,6 +175,11 @@ public: /// const X86RegisterInfo &getRegisterInfo() const { return RI; } + /// getSPAdjust - This returns the stack pointer adjustment made by + /// this instruction. For x86, we need to handle more complex call + /// sequences involving PUSHes. + int getSPAdjust(const MachineInstr *MI) const override; + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" /// extension instruction. That is, it's like a copy where it's legal for the /// source to overlap the destination. e.g. X86::MOVSX64rr32. 
If this returns @@ -404,7 +410,8 @@ public: MachineInstr* MI, unsigned OpNum, const SmallVectorImpl<MachineOperand> &MOs, - unsigned Size, unsigned Alignment) const; + unsigned Size, unsigned Alignment, + bool AllowCommute) const; void getUnconditionalBranch(MCInst &Branch, @@ -412,6 +419,8 @@ public: void getTrap(MCInst &MI) const override; + unsigned getJumpInstrTableEntryBound() const override; + bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const InstrItineraryData *ItinData, diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 0f872a676c25..53715dc6dd39 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -32,7 +32,8 @@ def SDTX86Cmov : SDTypeProfile<1, 4, // Unary and binary operator instructions that set EFLAGS as a side-effect. def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, - [SDTCisInt<0>, SDTCisVT<1, i32>]>; + [SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, @@ -188,11 +189,15 @@ def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; +def X86RecoverFrameAlloc : SDNode<"ISD::FRAME_ALLOC_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; @@ -261,121 +266,75 @@ def ptr_rc_nosp : PointerLikeRegClass<1>; def X86MemAsmOperand : AsmOperandClass { let Name = "Mem"; } -def X86Mem8AsmOperand : AsmOperandClass { - let Name = "Mem8"; let RenderMethod = 
"addMemOperands"; -} -def X86Mem16AsmOperand : AsmOperandClass { - let Name = "Mem16"; let RenderMethod = "addMemOperands"; -} -def X86Mem32AsmOperand : AsmOperandClass { - let Name = "Mem32"; let RenderMethod = "addMemOperands"; -} -def X86Mem64AsmOperand : AsmOperandClass { - let Name = "Mem64"; let RenderMethod = "addMemOperands"; -} -def X86Mem80AsmOperand : AsmOperandClass { - let Name = "Mem80"; let RenderMethod = "addMemOperands"; -} -def X86Mem128AsmOperand : AsmOperandClass { - let Name = "Mem128"; let RenderMethod = "addMemOperands"; -} -def X86Mem256AsmOperand : AsmOperandClass { - let Name = "Mem256"; let RenderMethod = "addMemOperands"; -} -def X86Mem512AsmOperand : AsmOperandClass { - let Name = "Mem512"; let RenderMethod = "addMemOperands"; -} - -// Gather mem operands -def X86MemVX32Operand : AsmOperandClass { - let Name = "MemVX32"; let RenderMethod = "addMemOperands"; -} -def X86MemVY32Operand : AsmOperandClass { - let Name = "MemVY32"; let RenderMethod = "addMemOperands"; -} -def X86MemVZ32Operand : AsmOperandClass { - let Name = "MemVZ32"; let RenderMethod = "addMemOperands"; -} -def X86MemVX64Operand : AsmOperandClass { - let Name = "MemVX64"; let RenderMethod = "addMemOperands"; -} -def X86MemVY64Operand : AsmOperandClass { - let Name = "MemVY64"; let RenderMethod = "addMemOperands"; -} -def X86MemVZ64Operand : AsmOperandClass { - let Name = "MemVZ64"; let RenderMethod = "addMemOperands"; +let RenderMethod = "addMemOperands" in { + def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } + def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } + def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } + def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } + def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } + def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } + def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } + def X86Mem512AsmOperand : AsmOperandClass { let 
Name = "Mem512"; } + // Gather mem operands + def X86MemVX32Operand : AsmOperandClass { let Name = "MemVX32"; } + def X86MemVY32Operand : AsmOperandClass { let Name = "MemVY32"; } + def X86MemVZ32Operand : AsmOperandClass { let Name = "MemVZ32"; } + def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; } + def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; } + def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; } } def X86AbsMemAsmOperand : AsmOperandClass { let Name = "AbsMem"; let SuperClasses = [X86MemAsmOperand]; } -class X86MemOperand<string printMethod> : Operand<iPTR> { + +class X86MemOperand<string printMethod, + AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> { let PrintMethod = printMethod; let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm); - let ParserMatchClass = X86MemAsmOperand; + let ParserMatchClass = parserMatchClass; + let OperandType = "OPERAND_MEMORY"; } -let OperandType = "OPERAND_MEMORY" in { +// Gather mem operands +class X86VMemOperand<RegisterClass RC, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, i8imm); +} + +def anymem : X86MemOperand<"printanymem">; + def opaque32mem : X86MemOperand<"printopaquemem">; def opaque48mem : X86MemOperand<"printopaquemem">; def opaque80mem : X86MemOperand<"printopaquemem">; def opaque512mem : X86MemOperand<"printopaquemem">; -def i8mem : X86MemOperand<"printi8mem"> { - let ParserMatchClass = X86Mem8AsmOperand; } -def i16mem : X86MemOperand<"printi16mem"> { - let ParserMatchClass = X86Mem16AsmOperand; } -def i32mem : X86MemOperand<"printi32mem"> { - let ParserMatchClass = X86Mem32AsmOperand; } -def i64mem : X86MemOperand<"printi64mem"> { - let ParserMatchClass = X86Mem64AsmOperand; } -def i128mem : X86MemOperand<"printi128mem"> { - let ParserMatchClass = X86Mem128AsmOperand; } -def i256mem : X86MemOperand<"printi256mem"> { - 
let ParserMatchClass = X86Mem256AsmOperand; } -def i512mem : X86MemOperand<"printi512mem"> { - let ParserMatchClass = X86Mem512AsmOperand; } -def f32mem : X86MemOperand<"printf32mem"> { - let ParserMatchClass = X86Mem32AsmOperand; } -def f64mem : X86MemOperand<"printf64mem"> { - let ParserMatchClass = X86Mem64AsmOperand; } -def f80mem : X86MemOperand<"printf80mem"> { - let ParserMatchClass = X86Mem80AsmOperand; } -def f128mem : X86MemOperand<"printf128mem"> { - let ParserMatchClass = X86Mem128AsmOperand; } -def f256mem : X86MemOperand<"printf256mem">{ - let ParserMatchClass = X86Mem256AsmOperand; } -def f512mem : X86MemOperand<"printf512mem">{ - let ParserMatchClass = X86Mem512AsmOperand; } -def v512mem : Operand<iPTR> { - let PrintMethod = "printf512mem"; - let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); - let ParserMatchClass = X86Mem512AsmOperand; } +def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; +def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; + +def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; // Gather mem operands -def vx32mem : X86MemOperand<"printi32mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); - let ParserMatchClass = X86MemVX32Operand; } -def vy32mem : 
X86MemOperand<"printi32mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); - let ParserMatchClass = X86MemVY32Operand; } -def vx64mem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR128, i32imm, i8imm); - let ParserMatchClass = X86MemVX64Operand; } -def vy64mem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm); - let ParserMatchClass = X86MemVY64Operand; } -def vy64xmem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm); - let ParserMatchClass = X86MemVY64Operand; } -def vz32mem : X86MemOperand<"printi32mem">{ - let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm); - let ParserMatchClass = X86MemVZ32Operand; } -def vz64mem : X86MemOperand<"printi64mem">{ - let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm); - let ParserMatchClass = X86MemVZ64Operand; } -} +def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>; +def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>; +def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>; +def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>; +def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64Operand>; +def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; +def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of // plain GR64, so that it doesn't potentially require a REX prefix. 
@@ -424,134 +383,184 @@ def brtarget8 : Operand<OtherVT>; } -def X86SrcIdx8Operand : AsmOperandClass { - let Name = "SrcIdx8"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem8AsmOperand]; -} -def X86SrcIdx16Operand : AsmOperandClass { - let Name = "SrcIdx16"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem16AsmOperand]; -} -def X86SrcIdx32Operand : AsmOperandClass { - let Name = "SrcIdx32"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem32AsmOperand]; -} -def X86SrcIdx64Operand : AsmOperandClass { - let Name = "SrcIdx64"; - let RenderMethod = "addSrcIdxOperands"; - let SuperClasses = [X86Mem64AsmOperand]; -} -def X86DstIdx8Operand : AsmOperandClass { - let Name = "DstIdx8"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem8AsmOperand]; -} -def X86DstIdx16Operand : AsmOperandClass { - let Name = "DstIdx16"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem16AsmOperand]; -} -def X86DstIdx32Operand : AsmOperandClass { - let Name = "DstIdx32"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem32AsmOperand]; -} -def X86DstIdx64Operand : AsmOperandClass { - let Name = "DstIdx64"; - let RenderMethod = "addDstIdxOperands"; - let SuperClasses = [X86Mem64AsmOperand]; -} -def X86MemOffs8AsmOperand : AsmOperandClass { - let Name = "MemOffs8"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem8AsmOperand]; -} -def X86MemOffs16AsmOperand : AsmOperandClass { - let Name = "MemOffs16"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem16AsmOperand]; -} -def X86MemOffs32AsmOperand : AsmOperandClass { - let Name = "MemOffs32"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem32AsmOperand]; -} -def X86MemOffs64AsmOperand : AsmOperandClass { - let Name = "MemOffs64"; - let RenderMethod = "addMemOffsOperands"; - let SuperClasses = [X86Mem64AsmOperand]; -} -let OperandType = "OPERAND_MEMORY" in { 
-def srcidx8 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx8Operand; - let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx8"; } -def srcidx16 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx16Operand; - let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx16"; } -def srcidx32 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx32Operand; - let MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx32"; } -def srcidx64 : Operand<iPTR> { - let ParserMatchClass = X86SrcIdx64Operand; +// Special parsers to detect mode to disambiguate. +def X86AbsMem16AsmOperand : AsmOperandClass { + let Name = "AbsMem16"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +def X86AbsMem32AsmOperand : AsmOperandClass { + let Name = "AbsMem32"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +// Branch targets have OtherVT type and print as pc-relative values. +let OperandType = "OPERAND_PCREL", + PrintMethod = "printPCRelImm" in { +let ParserMatchClass = X86AbsMem16AsmOperand in + def brtarget16 : Operand<OtherVT>; +let ParserMatchClass = X86AbsMem32AsmOperand in + def brtarget32 : Operand<OtherVT>; +} + +let RenderMethod = "addSrcIdxOperands" in { + def X86SrcIdx8Operand : AsmOperandClass { + let Name = "SrcIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86SrcIdx16Operand : AsmOperandClass { + let Name = "SrcIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86SrcIdx32Operand : AsmOperandClass { + let Name = "SrcIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86SrcIdx64Operand : AsmOperandClass { + let Name = "SrcIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addSrcIdxOperands" + +let RenderMethod = "addDstIdxOperands" in { + def X86DstIdx8Operand : AsmOperandClass { + let Name = "DstIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86DstIdx16Operand : 
AsmOperandClass { + let Name = "DstIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86DstIdx32Operand : AsmOperandClass { + let Name = "DstIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86DstIdx64Operand : AsmOperandClass { + let Name = "DstIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addDstIdxOperands" + +let RenderMethod = "addMemOffsOperands" in { + def X86MemOffs16_8AsmOperand : AsmOperandClass { + let Name = "MemOffs16_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs16_16AsmOperand : AsmOperandClass { + let Name = "MemOffs16_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs16_32AsmOperand : AsmOperandClass { + let Name = "MemOffs16_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_8AsmOperand : AsmOperandClass { + let Name = "MemOffs32_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs32_16AsmOperand : AsmOperandClass { + let Name = "MemOffs32_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs32_32AsmOperand : AsmOperandClass { + let Name = "MemOffs32_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_64AsmOperand : AsmOperandClass { + let Name = "MemOffs32_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } + def X86MemOffs64_8AsmOperand : AsmOperandClass { + let Name = "MemOffs64_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs64_16AsmOperand : AsmOperandClass { + let Name = "MemOffs64_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs64_32AsmOperand : AsmOperandClass { + let Name = "MemOffs64_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs64_64AsmOperand : AsmOperandClass { + let Name = "MemOffs64_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addMemOffsOperands" + +class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { let 
MIOperandInfo = (ops ptr_rc, i8imm); - let PrintMethod = "printSrcIdx64"; } -def dstidx8 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx8Operand; - let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx8"; } -def dstidx16 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx16Operand; - let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx16"; } -def dstidx32 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx32Operand; - let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx32"; } -def dstidx64 : Operand<iPTR> { - let ParserMatchClass = X86DstIdx64Operand; +} + +class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { let MIOperandInfo = (ops ptr_rc); - let PrintMethod = "printDstIdx64"; } -def offset8 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs8AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs8"; } -def offset16 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs16AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs16"; } -def offset32 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs32AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs32"; } -def offset64 : Operand<iPTR> { - let ParserMatchClass = X86MemOffs64AsmOperand; - let MIOperandInfo = (ops i64imm, i8imm); - let PrintMethod = "printMemOffs64"; } } +def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; +def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>; +def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; +def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; +def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; +def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; +def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; +def dstidx64 : 
X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; + +class X86MemOffsOperand<Operand immOperand, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops immOperand, i8imm); +} + +def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8", + X86MemOffs16_8AsmOperand>; +def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16", + X86MemOffs16_16AsmOperand>; +def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32", + X86MemOffs16_32AsmOperand>; +def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8", + X86MemOffs32_8AsmOperand>; +def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16", + X86MemOffs32_16AsmOperand>; +def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32", + X86MemOffs32_32AsmOperand>; +def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64", + X86MemOffs32_64AsmOperand>; +def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8", + X86MemOffs64_8AsmOperand>; +def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16", + X86MemOffs64_16AsmOperand>; +def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", + X86MemOffs64_32AsmOperand>; +def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", + X86MemOffs64_64AsmOperand>; def SSECC : Operand<i8> { let PrintMethod = "printSSECC"; let OperandType = "OPERAND_IMMEDIATE"; } +def i8immZExt3 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 8; +}]>; + def AVXCC : Operand<i8> { let PrintMethod = "printAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } -class ImmSExtAsmOperandClass : AsmOperandClass { - let SuperClasses = [ImmAsmOperand]; - let RenderMethod = "addImmOperands"; -} +def i8immZExt5 : ImmLeaf<i8, [{ + return Imm >= 0 && Imm < 32; +}]>; +// AVX-512 uses a 32-bit immediate in their intrinsics +def i32immZExt5 : ImmLeaf<i32, [{ + return Imm >= 0 && Imm < 32; +}]>; -class ImmZExtAsmOperandClass : AsmOperandClass { +class ImmSExtAsmOperandClass : AsmOperandClass { let SuperClasses = 
[ImmAsmOperand]; let RenderMethod = "addImmOperands"; } @@ -568,6 +577,7 @@ def AVX512RC : Operand<i32> { let PrintMethod = "printRoundingControl"; let OperandType = "OPERAND_IMMEDIATE"; } + // Sign-extended immediate classes. We don't need to define the full lattice // here because there is no instruction with an ambiguity between ImmSExti64i32 // and ImmSExti32i8. @@ -595,12 +605,6 @@ def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { let Name = "ImmSExti32i8"; } -// [0, 0x000000FF] -def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass { - let Name = "ImmZExtu32u8"; -} - - // [0, 0x0000007F] | // [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { @@ -620,11 +624,6 @@ def i32i8imm : Operand<i32> { let ParserMatchClass = ImmSExti32i8AsmOperand; let OperandType = "OPERAND_IMMEDIATE"; } -// 32-bits but only 8 bits are significant, and those 8 bits are unsigned. -def u32u8imm : Operand<i32> { - let ParserMatchClass = ImmZExtu32u8AsmOperand; - let OperandType = "OPERAND_IMMEDIATE"; -} // 64-bits but only 32 bits are significant. def i64i32imm : Operand<i64> { @@ -647,14 +646,14 @@ def i64i8imm : Operand<i64> { } def lea64_32mem : Operand<i32> { - let PrintMethod = "printi32mem"; + let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } // Memory operands that use 64-bit pointers in both ILP32 and LP64. 
def lea64mem : Operand<i64> { - let PrintMethod = "printi64mem"; + let PrintMethod = "printanymem"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); let ParserMatchClass = X86MemAsmOperand; } @@ -708,6 +707,7 @@ def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; @@ -724,9 +724,11 @@ def HasCDI : Predicate<"Subtarget->hasCDI()">; def HasPFI : Predicate<"Subtarget->hasPFI()">; def HasERI : Predicate<"Subtarget->hasERI()">; def HasDQI : Predicate<"Subtarget->hasDQI()">; +def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">; +def NoVLX : Predicate<"!Subtarget->hasVLX()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; @@ -748,8 +750,10 @@ def HasHLE : Predicate<"Subtarget->hasHLE()">; def HasTSX : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; +def HasSGX : Predicate<"Subtarget->hasSGX()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasSMAP : Predicate<"Subtarget->hasSMAP()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; @@ -758,6 +762,8 @@ def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit", "Not 
64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; +def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; def In16BitMode : Predicate<"Subtarget->is16Bit()">, AssemblerPredicate<"Mode16Bit", "16-bit mode">; def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, @@ -781,6 +787,7 @@ def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; +def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; //===----------------------------------------------------------------------===// // X86 Instruction Format Definitions. @@ -811,6 +818,11 @@ def X86_COND_O : PatLeaf<(i8 13)>; def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE def X86_COND_S : PatLeaf<(i8 15)>; +// Predicate used to help when pattern matching LZCNT/TZCNT. +def X86_COND_E_OR_NE : ImmLeaf<i8, [{ + return (Imm == X86::COND_E) || (Imm == X86::COND_NE); +}]>; + let FastIselShouldIgnore = 1 in { // FastIsel should ignore all simm8 instrs. 
def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; @@ -913,7 +925,7 @@ def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ // // Nop -let neverHasSideEffects = 1, SchedRW = [WriteZero] in { +let hasSideEffects = 0, SchedRW = [WriteZero] in { def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", [], IIC_NOP>; def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; @@ -927,12 +939,12 @@ def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), "enter\t$len, $lvl", [], IIC_ENTER>, Sched<[WriteMicrocoded]>; let SchedRW = [WriteALU] in { -let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[Not64BitMode]>; -let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", [], IIC_LEAVE>, Requires<[In64BitMode]>; @@ -942,7 +954,7 @@ def LEAVE64 : I<0xC9, RawFrm, // Miscellaneous Instructions. 
// -let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in { +let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { let mayLoad = 1, SchedRW = [WriteLoad] in { def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", [], IIC_POP_REG16>, OpSize16; @@ -956,11 +968,6 @@ def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[Not64BitMode]>; def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [], IIC_POP_MEM>, OpSize32, Requires<[Not64BitMode]>; - -def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, - OpSize16; -def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, - OpSize32, Requires<[Not64BitMode]>; } // mayLoad, SchedRW let mayStore = 1, SchedRW = [WriteStore] in { @@ -989,16 +996,26 @@ def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; +} // mayStore, SchedRW +} +let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, + SchedRW = [WriteLoad] in { +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, + OpSize16; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", [], IIC_POP_FD>, + OpSize32, Requires<[Not64BitMode]>; +} + +let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, hasSideEffects=0, + SchedRW = [WriteStore] in { def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", [], IIC_PUSH_F>, OpSize16; def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", [], IIC_PUSH_F>, OpSize32, Requires<[Not64BitMode]>; - -} // mayStore, SchedRW } -let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in { +let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { let mayLoad = 1, SchedRW = [WriteLoad] in { def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>, OpSize32, Requires<[In64BitMode]>; @@ -1017,7 +1034,7 @@ def PUSH64rmm: 
I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], } // mayStore, SchedRW } -let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1, +let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), "push{q}\t$imm", [], IIC_PUSH_IMM>, Requires<[In64BitMode]>; @@ -1029,22 +1046,22 @@ def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), Requires<[In64BitMode]>; } -let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, neverHasSideEffects=1 in +let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", [], IIC_POP_FD>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; -let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in +let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, hasSideEffects=0 in def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>, OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], - mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in { + mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", [], IIC_POP_A>, OpSize32, Requires<[Not64BitMode]>; def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", [], IIC_POP_A>, OpSize16, Requires<[Not64BitMode]>; } let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], - mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { + mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", [], IIC_PUSH_A>, OpSize32, Requires<[Not64BitMode]>; def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", [], IIC_PUSH_A>, @@ -1174,7 +1191,7 @@ def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), // Move Instructions. 
// let SchedRW = [WriteMove] in { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), @@ -1233,62 +1250,67 @@ def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), let hasSideEffects = 0 in { -/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a -/// 32-bit offset from the segment base. These are only valid in x86-32 mode. +/// Memory offset versions of moves. The immediate is an address mode sized +/// offset from the segment base. let SchedRW = [WriteALU] in { let mayLoad = 1 in { let Defs = [AL] in -def MOV8o8a : Ii32 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, - Requires<[In32BitMode]>; +def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, + AdSize32; let Defs = [AX] in -def MOV16o16a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, - OpSize16, Requires<[In32BitMode]>; +def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; let Defs = [EAX] in -def MOV32o32a : Ii32 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, - OpSize32, Requires<[In32BitMode]>; +def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Defs = [RAX] in +def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), + "mov{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, + AdSize32; let Defs = [AL] in -def MOV8o8a_16 : Ii16 <0xA0, RawFrmMemOffs, (outs), (ins offset8:$src), - "mov{b}\t{$src, %al|al, $src}", [], 
IIC_MOV_MEM>, - AdSize, Requires<[In16BitMode]>; +def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), + "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, AdSize16; let Defs = [AX] in -def MOV16o16a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset16:$src), - "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, - OpSize16, AdSize, Requires<[In16BitMode]>; +def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; let Defs = [EAX] in -def MOV32o32a_16 : Ii16 <0xA1, RawFrmMemOffs, (outs), (ins offset32:$src), - "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, - AdSize, OpSize32, Requires<[In16BitMode]>; +def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + AdSize16, OpSize32; } let mayStore = 1 in { let Uses = [AL] in -def MOV8ao8 : Ii32 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, - Requires<[In32BitMode]>; +def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs offset32_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize32; let Uses = [AX] in -def MOV16ao16 : Ii32 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, - OpSize16, Requires<[In32BitMode]>; +def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize32; let Uses = [EAX] in -def MOV32ao32 : Ii32 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, - OpSize32, Requires<[In32BitMode]>; +def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs offset32_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize32; +let Uses = [RAX] in +def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs offset32_64:$dst), (ins), + "mov{q}\t{%rax, $dst|$dst, rax}", [], 
IIC_MOV_MEM>, + AdSize32; let Uses = [AL] in -def MOV8ao8_16 : Ii16 <0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins), - "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, - AdSize, Requires<[In16BitMode]>; +def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs offset16_8:$dst), (ins), + "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, AdSize16; let Uses = [AX] in -def MOV16ao16_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins), - "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, - OpSize16, AdSize, Requires<[In16BitMode]>; +def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_16:$dst), (ins), + "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize16; let Uses = [EAX] in -def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), - "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, - OpSize32, AdSize, Requires<[In16BitMode]>; +def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs offset16_32:$dst), (ins), + "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize16; } } @@ -1296,40 +1318,34 @@ def MOV32ao32_16 : Ii16 <0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), // and use the movabs mnemonic to indicate this specific form. 
let mayLoad = 1 in { let Defs = [AL] in -def MOV64o8a : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset8:$src), - "movabs{b}\t{$src, %al|al, $src}", []>, - Requires<[In64BitMode]>; +def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64; let Defs = [AX] in -def MOV64o16a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset16:$src), - "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, - Requires<[In64BitMode]>; +def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64; let Defs = [EAX] in -def MOV64o32a : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset32:$src), +def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32, - Requires<[In64BitMode]>; + AdSize64; let Defs = [RAX] in -def MOV64o64a : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64:$src), - "movabs{q}\t{$src, %rax|rax, $src}", []>, - Requires<[In64BitMode]>; +def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64; } let mayStore = 1 in { let Uses = [AL] in -def MOV64ao8 : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset8:$dst), (ins), - "movabs{b}\t{%al, $dst|$dst, al}", []>, - Requires<[In64BitMode]>; +def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs offset64_8:$dst), (ins), + "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64; let Uses = [AX] in -def MOV64ao16 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset16:$dst), (ins), - "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, - Requires<[In64BitMode]>; +def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset64_16:$dst), (ins), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64; let Uses = [EAX] in -def MOV64ao32 : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs offset32:$dst), (ins), +def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, 
(outs offset64_32:$dst), (ins), "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32, - Requires<[In64BitMode]>; + AdSize64; let Uses = [RAX] in -def MOV64ao64 : RIi64<0xA3, RawFrmMemOffs, (outs offset64:$dst), (ins), - "movabs{q}\t{%rax, $dst|$dst, rax}", []>, - Requires<[In64BitMode]>; +def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs offset64_64:$dst), (ins), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64; } } // hasSideEffects = 0 @@ -1379,17 +1395,17 @@ def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), // that they can be used for copying and storing h registers, which can't be // encoded when a REX prefix is present. let isCodeGenOnly = 1 in { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def MOV8rr_NOREX : I<0x88, MRMDestReg, (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV>, Sched<[WriteMove]>; -let mayStore = 1, neverHasSideEffects = 1 in +let mayStore = 1, hasSideEffects = 0 in def MOV8mr_NOREX : I<0x88, MRMDestMem, (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), "mov{b}\t{$src, $dst|$dst, $src} # NOREX", [], IIC_MOV_MEM>, Sched<[WriteStore]>; -let mayLoad = 1, neverHasSideEffects = 1, +let mayLoad = 1, hasSideEffects = 0, canFoldAsLoad = 1, isReMaterializable = 1 in def MOV8rm_NOREX : I<0x8A, MRMSrcMem, (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), @@ -1403,7 +1419,7 @@ let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", [(set EFLAGS, (X86sahf AH))], IIC_AHF>; -let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in +let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], IIC_AHF>; // AH = flags } // SchedRW @@ -1989,42 +2005,42 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { } let Predicates = [HasLZCNT] in { - def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E), - (X86cmp GR16:$src, (i16 0))), + def : Pat<(X86cmov (ctlz 
GR16:$src), (i16 16), (X86_COND_E_OR_NE), + (X86cmp GR16:$src, (i16 0))), (LZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E), + def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (LZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E), + def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (LZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E), + def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E_OR_NE), (X86cmp GR16:$src, (i16 0))), (LZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E), + def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (LZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E), + def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (LZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (LZCNT16rm addr:$src)>; - def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (LZCNT32rm addr:$src)>; - def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (LZCNT64rm addr:$src)>; - def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), 
(X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (LZCNT16rm addr:$src)>; - def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (LZCNT32rm addr:$src)>; - def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (LZCNT64rm addr:$src)>; } @@ -2105,42 +2121,42 @@ let Predicates = [HasBMI] in { } let Predicates = [HasBMI] in { - def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E), + def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E_OR_NE), (X86cmp GR16:$src, (i16 0))), (TZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E), + def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (TZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E), + def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (TZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E), + def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E_OR_NE), (X86cmp GR16:$src, (i16 0))), (TZCNT16rr GR16:$src)>; - def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E), + def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E_OR_NE), (X86cmp GR32:$src, (i32 0))), (TZCNT32rr GR32:$src)>; - def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E), + def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E_OR_NE), (X86cmp GR64:$src, (i64 0))), (TZCNT64rr GR64:$src)>; - def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), 
(X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (TZCNT16rm addr:$src)>; - def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (TZCNT32rm addr:$src)>; - def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (TZCNT64rm addr:$src)>; - def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E), - (X86cmp (loadi16 addr:$src), (i16 0))), + def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi16 addr:$src), (i16 0))), (TZCNT16rm addr:$src)>; - def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E), - (X86cmp (loadi32 addr:$src), (i32 0))), + def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi32 addr:$src), (i32 0))), (TZCNT32rm addr:$src)>; - def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E), - (X86cmp (loadi64 addr:$src), (i64 0))), + def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E_OR_NE), + (X86cmp (loadi64 addr:$src), (i64 0))), (TZCNT64rm addr:$src)>; } @@ -2400,6 +2416,7 @@ include "X86InstrVMX.td" include "X86InstrSVM.td" include "X86InstrTSX.td" +include "X86InstrSGX.td" // System instructions. 
include "X86InstrSystem.td" @@ -2518,7 +2535,7 @@ def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; def : MnemonicAlias<"fucomip", "fucompi", "att">; -def : MnemonicAlias<"fwait", "wait", "att">; +def : MnemonicAlias<"fwait", "wait">; class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, @@ -2707,28 +2724,28 @@ def : InstAlias<"fnstsw" , (FNSTSW16r)>; // this is compatible with what GAS does. def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; def : InstAlias<"ljmp $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>; def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; def : InstAlias<"ljmp $seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; -def : InstAlias<"lcall *$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"ljmp *$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall {*}$dst", (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp {*}$dst", (FARJMP16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"call *$dst", (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"jmp *$dst", (JMP64m i16mem:$dst), 0>, Requires<[In64BitMode]>; -def : InstAlias<"call *$dst", (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>; -def : InstAlias<"jmp *$dst", (JMP32m i16mem:$dst), 0>, Requires<[In32BitMode]>; 
-def : InstAlias<"call *$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; -def : InstAlias<"jmp *$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"call {*}$dst", (CALL64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP64m i64mem:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"call {*}$dst", (CALL32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP32m i32mem:$dst), 0>, Requires<[In32BitMode]>; +def : InstAlias<"call {*}$dst", (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16BitMode]>; // "imul <imm>, B" is an alias for "imul <imm>, B, B". -def : InstAlias<"imulw $imm, $r", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm)>; -def : InstAlias<"imulw $imm, $r", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm)>; -def : InstAlias<"imull $imm, $r", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm)>; -def : InstAlias<"imull $imm, $r", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm)>; -def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>; -def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>; +def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; @@ -2752,34 +2769,34 @@ def : InstAlias<"jmpl $seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>; // Force mov without a suffix with a segment 
and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov {$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; +def : InstAlias<"mov {$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. -def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>; +def : InstAlias<"movq {$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; // Match 'movq GR64, MMX' as an alias for movd. -def : InstAlias<"movq $src, $dst", +def : InstAlias<"movq {$src, $dst|$dst, $src}", (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; -def : InstAlias<"movq $src, $dst", +def : InstAlias<"movq {$src, $dst|$dst, $src}", (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; // movsx aliases -def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; -def : InstAlias<"movsx $src, $dst", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0>; +def : 
InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movsx {$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0>; // movzx aliases -def : InstAlias<"movzx $src, $dst", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; -def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr8_Q GR64:$dst, GR8:$src), 0>; +def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>; // Note: No GR32->GR64 movzx form. 
// outb %dx -> outb %al, %dx diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index ecf80a1ac9bf..5b36427acb1b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -38,12 +38,17 @@ def MMX_PHADDSUBD : OpndItins< >; } +let Sched = WriteVecLogic in +def MMX_INTALU_ITINS_VECLOGICSCHED : OpndItins< + IIC_MMX_ALU_RR, IIC_MMX_ALU_RM +>; + let Sched = WriteVecIMul in def MMX_PMUL_ITINS : OpndItins< IIC_MMX_PMUL, IIC_MMX_PMUL >; -let Sched = WriteVecALU in { +let Sched = WriteVecIMul in { def MMX_PSADBW_ITINS : OpndItins< IIC_MMX_PSADBW, IIC_MMX_PSADBW >; @@ -166,13 +171,15 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, multiclass ssse3_palign_mm<string asm, Intrinsic IntId> { def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2, i8imm:$src3), - !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>; + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + Sched<[WriteShuffle]>; def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, i8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>; + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + Sched<[WriteShuffleLd, ReadAfterLd]>; } multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, @@ -192,11 +199,11 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], - NoItinerary, d>; + NoItinerary, d>, Sched<[WriteCvtI2F]>; def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, 
x86memop:$src2), asm, [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], - NoItinerary, d>; + NoItinerary, d>, Sched<[WriteCvtI2FLd]>; } //===----------------------------------------------------------------------===// @@ -213,7 +220,7 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", // Data Transfer Instructions def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", - [(set VR64:$dst, + [(set VR64:$dst, (x86mmx (scalar_to_vector GR32:$src)))], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>; let canFoldAsLoad = 1 in @@ -247,10 +254,10 @@ def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), let SchedRW = [WriteMove] in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), - "movd\t{$src, $dst|$dst, $src}", + "movd\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>; -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOVQ_RR>; @@ -427,13 +434,13 @@ let Constraints = "$src1 = $dst" in // Logical Instructions defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, - MMX_INTALU_ITINS, 1>; + MMX_INTALU_ITINS_VECLOGICSCHED, 1>; defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, - MMX_INTALU_ITINS>; + MMX_INTALU_ITINS_VECLOGICSCHED>; // Shift Instructions defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", @@ -479,19 +486,19 @@ defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, MMX_INTALU_ITINS>; // -- Unpack Instructions -defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, 
"punpckhbw", +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", int_x86_mmx_punpckhbw, MMX_UNPCK_H_ITINS>; -defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", int_x86_mmx_punpckhwd, MMX_UNPCK_H_ITINS>; -defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", int_x86_mmx_punpckhdq, MMX_UNPCK_H_ITINS>; -defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, "punpcklbw", int_x86_mmx_punpcklbw, MMX_UNPCK_L_ITINS>; -defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", int_x86_mmx_punpcklwd, MMX_UNPCK_L_ITINS>; defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", @@ -559,7 +566,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, - (outs VR64:$dst), + (outs VR64:$dst), (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3), "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, diff --git a/contrib/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm/lib/Target/X86/X86InstrSGX.td new file mode 100644 index 000000000000..47c5dc57fcc9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86InstrSGX.td @@ -0,0 +1,24 @@ +//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel SGX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SGX instructions + +// ENCLS - Execute an Enclave System Function of Specified Leaf Number +def ENCLS : I<0x01, MRM_CF, (outs), (ins), + "encls", []>, TB, Requires<[HasSGX]>; + +// ENCLU - Execute an Enclave User Function of Specified Leaf Number +def ENCLU : I<0x01, MRM_D7, (outs), (ins), + "enclu", []>, TB, Requires<[HasSGX]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 2bb898e7465b..fca0c443bf48 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -181,6 +181,7 @@ def SSE_MPSADBW_ITINS : OpndItins< IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM >; +let Sched = WriteVecIMul in def SSE_PMULLD_ITINS : OpndItins< IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM >; @@ -218,11 +219,21 @@ def DEFAULT_ITINS_BLENDSCHED : OpndItins< IIC_ALU_NONMEM, IIC_ALU_MEM >; +let Sched = WriteVarBlend in +def DEFAULT_ITINS_VARBLENDSCHED : OpndItins< + IIC_ALU_NONMEM, IIC_ALU_MEM +>; + let Sched = WriteFBlend in def SSE_INTALU_ITINS_FBLEND_P : OpndItins< IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM >; +let Sched = WriteBlend in +def SSE_INTALU_ITINS_BLEND_P : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + //===----------------------------------------------------------------------===// // SSE 1 & 2 Instructions Classes //===----------------------------------------------------------------------===// @@ -601,29 +612,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in { // Patterns let Predicates = [UseAVX] in { - let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVS{S,D} to the lower bits. 
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; - def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; - def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; - - // Move low f32 and clear high bits. - def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4f32 (V_SET0)), - (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>; - def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (v4i32 (V_SET0)), - (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>; - } - let AddedComplexity = 20 in { // MOVSSrm zeros the high parts of the register; represent this // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 @@ -659,31 +647,10 @@ let Predicates = [UseAVX] in { (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), - (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), - sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), - (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), - sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>; - // Move low f64 and clear high bits. 
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2f64 (V_SET0)), - (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>; - - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (v2i64 (V_SET0)), - (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>; - // Extract and store. def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), addr:$dst), @@ -734,7 +701,6 @@ let Predicates = [UseAVX] in { (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), sub_xmm)>; - // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the @@ -750,7 +716,7 @@ let Predicates = [UseAVX] in { } let Predicates = [UseSSE1] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), @@ -784,7 +750,7 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in { + let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. 
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), @@ -843,7 +809,7 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, string asm, Domain d, OpndItins itins, bit IsReMaterializable = 1> { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>, Sched<[WriteFShuffle]>; @@ -854,6 +820,7 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in Sched<[WriteLoad]>; } +let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS, VEX; @@ -879,20 +846,26 @@ defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD, VEX, VEX_L; +} + +let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SSE_MOVA_ITINS>, PS; -defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, - "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, - PD; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SSE_MOVU_ITINS>, PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, + "movapd", SSEPackedDouble, SSE_MOVA_ITINS>, + PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>, PD; +} -let SchedRW = [WriteStore] in { +let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in { def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [(alignedstore (v4f32 VR128:$src), addr:$dst)], @@ -1006,7 +979,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 
1, hasSideEffects = 0, - SchedRW = [WriteMove] in { + SchedRW = [WriteFShuffle] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -1036,7 +1009,7 @@ let Predicates = [UseSSE2] in (MOVUPDmr addr:$dst, VR128:$src)>; // Use vmovaps/vmovups for AVX integer load/store. -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // 128-bit load/store def : Pat<(alignedloadv2i64 addr:$src), (VMOVAPSrm addr:$src)>; @@ -1251,6 +1224,9 @@ let Predicates = [HasAVX] in { (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), @@ -1298,6 +1274,9 @@ let Predicates = [UseSSE2] in { (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (MOVLPDrm VR128:$src1, addr:$src2)>; // Store patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), @@ -1353,6 +1332,8 @@ let Predicates = [HasAVX] in { (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSrm VR128:$src1, addr:$src2)>; + // VMOVHPD patterns + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. 
One use disappears at isel time @@ -1360,6 +1341,16 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (VMOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; } let Predicates = [UseSSE1] in { @@ -1373,6 +1364,8 @@ let Predicates = [UseSSE1] in { } let Predicates = [UseSSE2] in { + // MOVHPD patterns + // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem // is during lowering, where it's not possible to recognize the load fold // cause it has two uses through a bitcast. One use disappears at isel time @@ -1380,6 +1373,16 @@ let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86Unpckl VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))), (MOVHPDrm VR128:$src1, addr:$src2)>; + // Also handle an i64 load because that may get selected as a faster way to + // load the data. 
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (vector_extract + (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; } //===----------------------------------------------------------------------===// @@ -1488,7 +1491,7 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm, Domain d, OpndItins itins> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def rr : I<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, [], itins.rr, d>, Sched<[itins.Sched]>; let mayLoad = 1 in @@ -1499,7 +1502,7 @@ let neverHasSideEffects = 1 in { multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm> { -let neverHasSideEffects = 1, Predicates = [UseAVX] in { +let hasSideEffects = 0, Predicates = [UseAVX] in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, Sched<[WriteCvtI2F]>; @@ -1508,7 +1511,7 @@ let neverHasSideEffects = 1, Predicates = [UseAVX] in { (ins DstRC:$src1, x86memop:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, Sched<[WriteCvtI2FLd, ReadAfterLd]>; -} // neverHasSideEffects = 1 +} // hasSideEffects = 0 } let Predicates = [UseAVX] in { @@ -1815,7 +1818,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", /// SSE 2 Only // Convert scalar double to scalar single -let neverHasSideEffects = 1, Predicates = [UseAVX] in { +let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], @@ -1880,7 +1883,7 @@ def Int_CVTSD2SSrm: 
I<0x5A, MRMSrcReg, // Convert scalar single to scalar double // SSE2 instructions with XS prefix -let neverHasSideEffects = 1, Predicates = [UseAVX] in { +let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -2202,7 +2205,7 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), // Convert Packed DW Integers to Packed Double FP let Predicates = [HasAVX] in { -let neverHasSideEffects = 1, mayLoad = 1 in +let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX, Sched<[WriteCvtI2FLd]>; @@ -2224,7 +2227,7 @@ def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), Sched<[WriteCvtI2F]>; } -let neverHasSideEffects = 1, mayLoad = 1 in +let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; @@ -2330,15 +2333,15 @@ let Predicates = [UseSSE2] in { multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, Operand CC, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, string asm_alt, - OpndItins itins> { + OpndItins itins, ImmLeaf immLeaf> { def rr : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], itins.rr>, Sched<[itins.Sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))], + (ld_frag addr:$src2), immLeaf:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; @@ -2358,38 +2361,37 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S>, - XS, VEX_4V, VEX_LIG; + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG; defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S>, // same latency as 32 bit compare + SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare XD, VEX_4V, VEX_LIG; let Constraints = "$src1 = $dst" in { defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", - "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>, - XS; + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, + i8immZExt3>, XS; defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSE_ALU_F64S>, - XD; + SSE_ALU_F64S, i8immZExt3>, XD; } multiclass sse12_cmp_scalar_int<X86MemOperand x86memop, Operand CC, - Intrinsic Int, string asm, OpndItins itins> { + Intrinsic Int, string asm, OpndItins itins, + ImmLeaf immLeaf> { def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - VR128:$src, imm:$cc))], + VR128:$src, immLeaf:$cc))], itins.rr>, Sched<[itins.Sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - (load addr:$src), imm:$cc))], + (load addr:$src), immLeaf:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -2398,19 +2400,19 @@ let isCodeGenOnly = 1 in { // Aliases to match intrinsics which expect XMM operand(s). 
defm Int_VCMPSS : sse12_cmp_scalar_int<f32mem, AVXCC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S>, + SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V; defm Int_VCMPSD : sse12_cmp_scalar_int<f64mem, AVXCC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S>, // same latency as f32 + SSE_ALU_F32S, i8immZExt5>, // same latency as f32 XD, VEX_4V; let Constraints = "$src1 = $dst" in { defm Int_CMPSS : sse12_cmp_scalar_int<f32mem, SSECC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}", - SSE_ALU_F32S>, XS; + SSE_ALU_F32S, i8immZExt3>, XS; defm Int_CMPSD : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SSE_ALU_F64S>, + SSE_ALU_F64S, i8immZExt3>, XD; } } @@ -2484,16 +2486,16 @@ let Defs = [EFLAGS] in { // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Operand CC, Intrinsic Int, string asm, - string asm_alt, Domain d, + string asm_alt, Domain d, ImmLeaf immLeaf, OpndItins itins = SSE_ALU_F32P> { def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], + [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))], itins.rr, d>, Sched<[WriteFAdd]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, - [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], + [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), immLeaf:$cc))], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; @@ -2512,28 +2514,28 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, PS, VEX_4V; + SSEPackedSingle, i8immZExt5>, PS, VEX_4V; defm VCMPPD : 
sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, PD, VEX_4V; + SSEPackedDouble, i8immZExt5>, PD, VEX_4V; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle>, PS, VEX_4V, VEX_L; + SSEPackedSingle, i8immZExt5>, PS, VEX_4V, VEX_L; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble>, PD, VEX_4V, VEX_L; + SSEPackedDouble, i8immZExt5>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedSingle, SSE_ALU_F32P>, PS; + SSEPackedSingle, i8immZExt5, SSE_ALU_F32P>, PS; defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedDouble, SSE_ALU_F64P>, PD; + SSEPackedDouble, i8immZExt5, SSE_ALU_F64P>, PD; } let Predicates = [HasAVX] in { @@ -2577,18 +2579,17 @@ def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)), /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, - Domain d, bit IsConvertibleToThreeAddress = 0> { + Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, Sched<[WriteFShuffleLd, ReadAfterLd]>; - let isConvertibleToThreeAddress = 
IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), asm, - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffle]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, + Sched<[WriteFShuffle]>; } defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2607,10 +2608,10 @@ defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SSEPackedSingle, 1 /* cvt to pshufd */>, PS; + memopv4f32, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SSEPackedDouble, 1 /* cvt to pshufd */>, PD; + memopv2f64, SSEPackedDouble>, PD; } let Predicates = [HasAVX] in { @@ -2850,7 +2851,7 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, ValueType OpVT128, ValueType OpVT256, OpndItins itins, bit IsCommutable = 0> { -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoVLX] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; @@ -2858,7 +2859,7 @@ let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, memopv2i64, i128mem, itins, IsCommutable, 1>; -let Predicates = [HasAVX2] in +let Predicates = [HasAVX2, NoVLX] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; @@ -2920,6 +2921,7 @@ let isCodeGenOnly = 1 in { /// multiclass sse12_fp_packed_logical<bits<8> opc, string 
OpcodeStr, SDNode OpNode> { + let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))], @@ -2950,6 +2952,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), (loadv2i64 addr:$src2)))], 0>, PD, VEX_4V; + } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, @@ -3005,6 +3008,7 @@ let Predicates = [HasAVX1Only] in { /// classes below multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, SizeItins itins> { + let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins.s, 0>, PS, VEX_4V; @@ -3018,6 +3022,7 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L; + } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, @@ -3111,7 +3116,7 @@ let isCodeGenOnly = 1 in { // previously we generated: // addss %xmm0, %xmm1 // movss %xmm1, %xmm0 -// +// // we now generate: // addss %xmm1, %xmm0 @@ -3136,7 +3141,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // SSE2 patterns to select scalar double-precision fp arithmetic instructions - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), @@ -3156,10 +3160,10 @@ let Predicates = [UseSSE2] in { } let Predicates = [UseSSE41] in { - // If the subtarget has SSE4.1 but not AVX, the vector insert - // instruction is lowered into a X86insertps rather than a X86Movss. 
- // When selecting SSE scalar single-precision fp arithmetic instructions, - // make sure that we correctly match the X86insertps. + // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is + // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When + // selecting SSE scalar single-precision fp arithmetic instructions, make + // sure that we correctly match them. def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), @@ -3177,6 +3181,57 @@ let Predicates = [UseSSE41] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 
VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } let Predicates = [HasAVX] in { @@ -3215,6 +3270,57 @@ let Predicates = [HasAVX] in { (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (iPTR 0))), (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 
VR128:$dst), (v4f32 (scalar_to_vector (fsub + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv + (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + FR32:$src))), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, 
(COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv + (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + FR64:$src))), (v2f64 VR128:$dst), (i8 2))), + (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; } // Patterns used to select SSE scalar fp arithmetic instructions from @@ -3232,7 +3338,7 @@ let Predicates = [HasAVX] in { // previously we generated: // addps %xmm0, %xmm1 // movss %xmm1, %xmm0 -// +// // we now generate: // addss %xmm1, %xmm0 @@ -3240,13 +3346,13 @@ let Predicates = [UseSSE1] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))), (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))), (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))), (MULSSrr_Int v4f32:$dst, v4f32:$src)>; - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), + def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))), (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; } @@ -3269,6 +3375,49 @@ let Predicates = [UseSSE2] in { (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; } +let Predicates = [UseSSE41] in { + // With SSE4.1 we may see these operations using X86Blendi rather than + // X86Movs{s,d}. 
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (ADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (SUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (MULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (DIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (ADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (SUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (MULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (DIVSDrr_Int v2f64:$dst, v2f64:$src)>; +} + let Predicates = [HasAVX] in { // The following patterns select AVX Scalar single/double precision fp // arithmetic instructions from a packed single precision fp instruction @@ -3298,6 
+3447,46 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + // Also handle X86Blendi-based patterns. + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VADDSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VMULSSrr_Int v4f32:$dst, v4f32:$src)>; + def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), + (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), + (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>; + + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), + (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), + (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; + + def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VADDSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + (VMULSDrr_Int v2f64:$dst, v2f64:$src)>; + def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), + (v2f64 VR128:$dst), (i8 2))), + 
(VDIVSDrr_Int v2f64:$dst, v2f64:$src)>; } /// Unop Arithmetic @@ -3326,6 +3515,16 @@ def SSE_SQRTSD : OpndItins< >; } +let Sched = WriteFRsqrt in { +def SSE_RSQRTPS : OpndItins< + IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM +>; + +def SSE_RSQRTSS : OpndItins< + IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM +>; +} + let Sched = WriteFRcp in { def SSE_RCPP : OpndItins< IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM @@ -3336,57 +3535,10 @@ def SSE_RCPS : OpndItins< >; } -/// sse1_fp_unop_s - SSE1 unops in scalar form. -multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int, OpndItins itins> { -let Predicates = [HasAVX], hasSideEffects = 0 in { - def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), - (ins FR32:$src1, FR32:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, Sched<[itins.Sched]>; - let mayLoad = 1 in { - def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), - (ins FR32:$src1,f32mem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - let isCodeGenOnly = 1 in - def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, ssmem:$src2), - !strconcat("v", OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, VEX_4V, VEX_LIG, - Sched<[itins.Sched.Folded, ReadAfterLd]>; - } -} - - def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode FR32:$src))]>, Sched<[itins.Sched]>; - // For scalar unary operations, fold a load into the operation - // only in OptForSize mode. It eliminates an instruction, but it also - // eliminates a whole-register clobber (the load), so it introduces a - // partial register update condition. 
- def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS, - Requires<[UseSSE1, OptForSize]>, Sched<[itins.Sched.Folded]>; -let isCodeGenOnly = 1 in { - def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int VR128:$src))], itins.rr>, - Sched<[itins.Sched]>; - def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src), - !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>, - Sched<[itins.Sched.Folded]>; -} -} - -/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand. -multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode, +/// sse1_fp_unop_s - SSE1 unops in scalar form +/// For the non-AVX defs, we need $src1 to be tied to $dst because +/// the HW instructions are 2 operand / destructive. +multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX], hasSideEffects = 0 in { def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), @@ -3595,20 +3747,19 @@ let Predicates = [HasAVX] in { } // Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss, - SSE_SQRTSS>, +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. 
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>, +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>, sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps, - int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>; -defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>, + int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>, sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, int_x86_avx_rcp_ps_256, SSE_RCPP>; @@ -3669,13 +3820,15 @@ let Predicates = [HasAVX] in { (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; } -// Reciprocal approximations. Note that these typically require refinement -// in order to obtain suitable precision. +// These are unary operations, but they are modeled as having 2 source operands +// because the high elements of the destination are unchanged in SSE. let Predicates = [UseSSE1] in { def : Pat<(int_x86_sse_rsqrt_ss VR128:$src), (RSQRTSSr_Int VR128:$src, VR128:$src)>; def : Pat<(int_x86_sse_rcp_ss VR128:$src), (RCPSSr_Int VR128:$src, VR128:$src)>; + def : Pat<(int_x86_sse_sqrt_ss VR128:$src), + (SQRTSSr_Int VR128:$src, VR128:$src)>; } // There is no f64 version of the reciprocal approximation instructions. 
@@ -3686,6 +3839,7 @@ let Predicates = [UseSSE1] in { let AddedComplexity = 400 in { // Prefer non-temporal versions let SchedRW = [WriteStore] in { +let Predicates = [HasAVX, NoVLX] in { def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3726,6 +3880,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)], IIC_SSE_MOVNT>, VEX, VEX_L; +} def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), "movntps\t{$src, $dst|$dst, $src}", @@ -3755,6 +3910,14 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStore] +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (VMOVNTPSmr addr:$dst, VR128:$src)>; +} + +def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), + (MOVNTPSmr addr:$dst, VR128:$src)>; + } // AddedComplexity //===----------------------------------------------------------------------===// @@ -3788,8 +3951,8 @@ def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), let SchedRW = [WriteNop] in { // Pause. This "instruction" is encoded as "rep; nop", so even though it // was introduced with SSE2, it's backward compatible. 
-def PAUSE : I<0x90, RawFrm, (outs), (ins), - "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, +def PAUSE : I<0x90, RawFrm, (outs), (ins), + "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS, Requires<[HasSSE2]>; } @@ -3821,12 +3984,14 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>; -def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src), - "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], - IIC_SSE_LDMXCSR>, Sched<[WriteLoad]>; -def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), - "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], - IIC_SSE_STMXCSR>, Sched<[WriteStore]>; +let Predicates = [UseSSE1] in { +def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)], + IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>; +def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)], + IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>; +} //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -3834,7 +3999,7 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let ExeDomain = SSEPackedInt in { // SSE integer instructions -let neverHasSideEffects = 1, SchedRW = [WriteMove] in { +let hasSideEffects = 0, SchedRW = [WriteMove] in { def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, VEX; @@ -3869,7 +4034,7 @@ def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), } let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - neverHasSideEffects = 1, SchedRW = [WriteLoad] in { + hasSideEffects = 0, SchedRW = [WriteLoad] in { def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>, VEX; @@ 
-3886,7 +4051,7 @@ let Predicates = [HasAVX] in { } } -let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>, @@ -3906,7 +4071,7 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), } let SchedRW = [WriteMove] in { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>; @@ -3927,7 +4092,7 @@ def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), } // SchedRW let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, - neverHasSideEffects = 1, SchedRW = [WriteLoad] in { + hasSideEffects = 0, SchedRW = [WriteLoad] in { def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/], @@ -3939,7 +4104,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), XS, Requires<[UseSSE2]>; } -let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in { +let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/], @@ -5222,7 +5387,7 @@ let Predicates = [UseSSE3] in { //===---------------------------------------------------------------------===// multiclass sse3_replicate_dfp<string OpcodeStr> { -let neverHasSideEffects = 1 in +let hasSideEffects = 0 in def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; @@ -5277,6 +5442,13 @@ let Predicates = [HasAVX] in { (VMOVDDUPYrr 
VR256:$src)>; } +let Predicates = [UseAVX, OptForSize] in { + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + let Predicates = [UseSSE3] in { def : Pat<(X86Movddup (memopv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; @@ -5357,56 +5529,34 @@ let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { // Patterns used to select 'addsub' instructions. let Predicates = [HasAVX] in { - // Constant 170 corresponds to the binary mask '10101010'. - // When used as a blend mask, it allows selecting eight elements from two - // input vectors as follow: - // - Even-numbered values in the destination are copied from - // the corresponding elements in the first input vector; - // - Odd-numbered values in the destination are copied from - // the corresponding elements in the second input vector. - - def : Pat<(v8f32 (X86Blendi (v8f32 (fsub VR256:$lhs, VR256:$rhs)), - (v8f32 (fadd VR256:$lhs, VR256:$rhs)), (i32 170))), - (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - - // Constant 10 corresponds to the binary mask '1010'. - // In the two pattens below, constant 10 is used as a blend mask to select - // - the 1st and 3rd element from the first input vector (the 'fsub' node); - // - the 2nd and 4th element from the second input vector (the 'fadd' node). 
- - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)), - (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), + (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 (memop addr:$rhs)))), + (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), + (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; + def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 (memop addr:$rhs)))), + (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; } let Predicates = [UseSSE3] in { - // Constant 10 corresponds to the binary mask '1010'. - // In the pattern below, it is used as a blend mask to select: - // - the 1st and 3rd element from the first input vector (the fsub node); - // - the 2nd and 4th element from the second input vector (the fadd node). 
- - def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)), - (v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - - def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)), - (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))), - (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)), - (v2f64 (fsub VR128:$lhs, VR128:$rhs)))), + def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 (memop addr:$rhs)))), + (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; + def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 (memop addr:$rhs)))), + (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; } //===---------------------------------------------------------------------===// @@ -5810,7 +5960,7 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", //===---------------------------------------------------------------------===// multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { - let neverHasSideEffects = 1 in { + let hasSideEffects = 0 in { def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3), !if(Is2Addr, @@ -5830,7 +5980,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> { } multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { - let neverHasSideEffects = 1 in { + let hasSideEffects = 0 in { def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i8imm:$src3), !strconcat(asm, @@ -5917,552 +6067,240 @@ def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, // SSE4.1 - Packed Move with Sign/Zero Extend //===----------------------------------------------------------------------===// -multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { - 
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, - Sched<[itins.Sched]>; - - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))], - itins.rm>, Sched<[itins.Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId (load addr:$src)))]>, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", - int_x86_sse41_pmovsxbw, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", - int_x86_sse41_pmovsxwd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", - int_x86_sse41_pmovsxdq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", - int_x86_sse41_pmovzxbw, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", - int_x86_sse41_pmovzxwd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", - int_x86_sse41_pmovzxdq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -} - -let Predicates = [HasAVX2] in { -defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw", - int_x86_avx2_pmovsxbw, - WriteShuffle>, VEX, VEX_L; -defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd", - int_x86_avx2_pmovsxwd, - WriteShuffle>, VEX, VEX_L; -defm 
VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq", - int_x86_avx2_pmovsxdq, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw", - int_x86_avx2_pmovzxbw, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd", - int_x86_avx2_pmovzxwd, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq", - int_x86_avx2_pmovzxdq, - WriteShuffle>, VEX, VEX_L; -} - -defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq, - SSE_INTALU_ITINS_SHUFF_P>; - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load. 
- def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), - (VPMOVSXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), - (VPMOVSXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), - (VPMOVSXDQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), - (VPMOVZXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), - (VPMOVZXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), - (VPMOVZXDQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - // Common patterns involving scalar load. 
- def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)), - (PMOVSXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))), - (PMOVSXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)), - (PMOVSXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))), - (PMOVSXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)), - (PMOVSXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))), - (PMOVSXDQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)), - (PMOVZXBWrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))), - (PMOVZXBWrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)), - (PMOVZXWDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))), - (PMOVZXWDrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)), - (PMOVZXDQrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))), - (PMOVZXDQrm addr:$src)>; -} - -multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), +multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, + RegisterClass OutRC, RegisterClass InRC, + 
OpndItins itins> { + def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, + [], itins.rr>, Sched<[itins.Sched]>; - def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, - (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))], - itins.rm>, Sched<[itins.Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, - (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>, - Sched<[Sched.Folded]>; -} - -let Predicates = [HasAVX] in { -defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXBD : SS41I_binop_rm_int4<0x31, "vpmovzxbd", int_x86_sse41_pmovzxbd, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq, - DEFAULT_ITINS_SHUFFLESCHED>, VEX; -} - -let Predicates = [HasAVX2] in { -defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd", - int_x86_avx2_pmovsxbd, WriteShuffle>, - VEX, VEX_L; -defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq", - int_x86_avx2_pmovsxwq, WriteShuffle>, - VEX, VEX_L; -defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd", - int_x86_avx2_pmovzxbd, WriteShuffle>, - VEX, VEX_L; -defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq", - 
int_x86_avx2_pmovzxwq, WriteShuffle>, - VEX, VEX_L; -} - -defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd, - SSE_INTALU_ITINS_SHUFF_P>; -defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq, - SSE_INTALU_ITINS_SHUFF_P>; - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (VPMOVSXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (VPMOVSXWQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (VPMOVZXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (VPMOVZXWQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)), - (PMOVSXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)), - (PMOVSXWQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)), - (PMOVZXBDrm addr:$src)>; - def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)), - (PMOVZXWQrm addr:$src)>; -} - -multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId, - X86FoldableSchedWrite Sched> { - def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - // Expecting a i16 load any extended to i32 value. 
- def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId (bitconvert - (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>, - Sched<[Sched.Folded]>; -} - -multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr, - Intrinsic IntId, X86FoldableSchedWrite Sched> { - def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId VR128:$src))]>, Sched<[Sched]>; - - // Expecting a i16 load any extended to i32 value. - def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i16mem:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (IntId (bitconvert - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>, - Sched<[Sched.Folded]>; + [], + itins.rm>, Sched<[itins.Sched.Folded]>; } -let Predicates = [HasAVX] in { -defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq, - WriteShuffle>, VEX; -defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq, - WriteShuffle>, VEX; -} -let Predicates = [HasAVX2] in { -defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq", int_x86_avx2_pmovsxbq, - WriteShuffle>, VEX, VEX_L; -defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq", int_x86_avx2_pmovzxbq, - WriteShuffle>, VEX, VEX_L; -} -defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq, - WriteShuffle>; -defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq, - WriteShuffle>; - -let Predicates = [HasAVX2] in { - def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>; - def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>; - - def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vsext (v8i16 
VR128:$src))), (VPMOVSXWQYrr VR128:$src)>; - - def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>; - - def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))), - (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))), - (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))), - (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))), - (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vsext (v8i16 (bitconvert (v2i64 (load addr:$src)))))), - (VPMOVSXWDYrm addr:$src)>; - def : Pat<(v4i64 (X86vsext (v4i32 (bitconvert (v2i64 (load addr:$src)))))), - (VPMOVSXDQYrm addr:$src)>; - - def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXBDYrm addr:$src)>; - def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXBDYrm addr:$src)>; - - def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXWQYrm addr:$src)>; - def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXWQYrm addr:$src)>; - - def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQYrm addr:$src)>; -} - -let Predicates = [HasAVX] in { - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 
addr:$src))))))), - (VPMOVZXBQrm addr:$src)>; -} - -let Predicates = [UseSSE41] in { - def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>; - - // Common patterns involving scalar load - def : Pat<(int_x86_sse41_pmovsxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; - - def : Pat<(int_x86_sse41_pmovzxbq - (bitconvert (v4i32 (X86vzmovl - (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVSXWQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (extloadi32i16 addr:$src))))))), - (PMOVSXBQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXDQrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVSXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 - 
(scalar_to_vector (loadf64 addr:$src))))))), - (PMOVSXBWrm addr:$src)>; +multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp, + OpndItins SSEItins, OpndItins AVXItins, + OpndItins AVX2Itins> { + defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>; + let Predicates = [HasAVX] in + defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, + VR128, VR128, AVXItins>, VEX; + let Predicates = [HasAVX2] in + defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, + VR256, VR128, AVX2Itins>, VEX, VEX_L; +} + +multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp> { + defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; + defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), + !strconcat("pmovzx", OpcodeStr), + MemOp, MemYOp, + SSE_INTALU_ITINS_SHUFF_P, + DEFAULT_ITINS_SHUFFLESCHED, + DEFAULT_ITINS_SHUFFLESCHED>; +} + +defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem>; +defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem>; +defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem>; + +defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem>; +defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem>; + +defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem>; + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, SDNode ExtOp> { + // Register-Register patterns + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; + + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + 
(!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; + + // On AVX2, we also support 256bit inputs. + // FIXME: remove these patterns when the old shuffle lowering goes away. + def : Pat<(v16i16 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BWYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v8i32 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v32i8 VR256:$src))), + (!cast<I>(OpcPrefix#BQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v8i32 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WDYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + def : Pat<(v4i64 (ExtOp (v16i16 VR256:$src))), + (!cast<I>(OpcPrefix#WQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + def : Pat<(v4i64 (ExtOp (v8i32 VR256:$src))), + (!cast<I>(OpcPrefix#DQYrr) (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + + // AVX2 Register-Memory patterns + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + 
def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; } let Predicates = [HasAVX2] in { - def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>; - def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>; - - def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; - def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), 
(VPMOVZXWQYrr VR128:$src)>; - - def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>; - - def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))), - (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))), - (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))), - (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; - - def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))), - (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", X86vsext>; + defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", X86vzext>; +} + +// SSE4.1/AVX patterns. +multiclass SS41I_pmovx_patterns<string OpcPrefix, SDNode ExtOp, + PatFrag ExtLoad16> { + def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; + + def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; + + def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; + + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : 
Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; } let Predicates = [HasAVX] in { - def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>; - - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXBWrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), - (VPMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVZXWQrm addr:$src)>; - - def : Pat<(v2i64 (X86vzext 
(v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), - (VPMOVZXDQrm addr:$src)>; - - def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXDQrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64 - (scalar_to_vector (loadi64 addr:$src))))))), - (VPMOVSXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64 - (scalar_to_vector (loadf64 addr:$src))))))), - (VPMOVSXBWrm addr:$src)>; - - def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32 - (scalar_to_vector (loadi32 addr:$src))))))), - (VPMOVSXWQrm addr:$src)>; - def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32 - (scalar_to_vector (extloadi32i16 addr:$src))))))), - (VPMOVSXBQrm addr:$src)>; + defm 
: SS41I_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>; + defm : SS41I_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>; } let Predicates = [UseSSE41] in { - def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>; - - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXBWrm addr:$src)>; - def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXBWrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXBDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), - (PMOVZXBQrm addr:$src)>; - - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXWDrm addr:$src)>; - def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXWDrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), - (PMOVZXWQrm addr:$src)>; - - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), - (PMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), - (PMOVZXDQrm addr:$src)>; - def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), - (PMOVZXDQrm addr:$src)>; + defm : SS41I_pmovx_patterns<"PMOVSX", X86vsext, extloadi32i16>; + defm : 
SS41I_pmovx_patterns<"PMOVZX", X86vzext, loadi16_anyext>; } //===----------------------------------------------------------------------===// @@ -6478,7 +6316,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>, Sched<[WriteShuffle]>; - let neverHasSideEffects = 1, mayStore = 1, + let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2), @@ -6503,7 +6341,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[WriteShuffle]>; - let neverHasSideEffects = 1, mayStore = 1, + let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in def mr : SS4AIi8<opc, MRMDestMem, (outs), (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2), @@ -6692,7 +6530,7 @@ let Constraints = "$src1 = $dst" in multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, OpndItins itins = DEFAULT_ITINS> { def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u32u8imm:$src3), + (ins VR128:$src1, VR128:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -6701,7 +6539,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1, (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>, Sched<[WriteFShuffle]>; def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128:$src1, f32mem:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, @@ -7221,7 +7059,7 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[itins.Sched.Folded, ReadAfterLd]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { let isCommutable = 0 in defm VPMINSB : SS48I_binop_rm<0x38, 
"vpminsb", X86smin, v16i8, VR128, loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -7252,7 +7090,7 @@ let Predicates = [HasAVX] in { SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { let isCommutable = 0 in defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256, loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, @@ -7306,9 +7144,9 @@ let Constraints = "$src1 = $dst" in { SSE_INTMUL_ITINS_P, 1>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, - memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, + memopv2i64, i128mem, 0, SSE_PMULLD_ITINS>, VEX_4V; defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, memopv2i64, i128mem, 0, SSE_INTALU_ITINS_P>, @@ -7316,7 +7154,7 @@ let Predicates = [HasAVX] in { } let Predicates = [HasAVX2] in { defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, - memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, + memopv4i64, i256mem, 0, SSE_PMULLD_ITINS>, VEX_4V, VEX_L; defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, memopv4i64, i256mem, 0, SSE_INTALU_ITINS_P>, @@ -7337,7 +7175,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, OpndItins itins = DEFAULT_ITINS> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u32u8imm:$src3), + (ins RC:$src1, RC:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7346,7 +7184,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>, Sched<[itins.Sched]>; def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), @@ -7360,31 +7198,33 @@ multiclass 
SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let isCommutable = 0 in { - let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, loadv8f32, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } - let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256,VR256, loadv4f64, - f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, - VEX_4V, VEX_L; - } + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + DEFAULT_ITINS_MPSADSCHED>, VEX_4V; + } + + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, loadv4f32, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, loadv8f32, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, loadv2f64, f128mem, 0, + DEFAULT_ITINS_FBLENDSCHED>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256,VR256, loadv4f64, + f256mem, 0, DEFAULT_ITINS_FBLENDSCHED>, + VEX_4V, VEX_L; + } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, loadv2i64, i128mem, 0, DEFAULT_ITINS_BLENDSCHED>, VEX_4V; - defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_MPSADSCHED>, VEX_4V; - } + let ExeDomain = 
SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, @@ -7401,17 +7241,21 @@ let Predicates = [HasAVX] in { let Predicates = [HasAVX2] in { let isCommutable = 0 in { - defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, - VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, VR256, loadv4i64, i256mem, 0, DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L; } + defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw, + VR256, loadv4i64, i256mem, 0, + DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, + 1, SSE_MPSADBW_ITINS>; + } let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, VR128, memopv4f32, f128mem, @@ -7422,11 +7266,7 @@ let Constraints = "$src1 = $dst" in { 1, SSE_INTALU_ITINS_FBLEND_P>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, - VR128, memopv2i64, i128mem, - 1, SSE_MPSADBW_ITINS>; - } + 1, SSE_INTALU_ITINS_BLEND_P>; let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -7545,6 +7385,57 @@ let Predicates = [HasAVX2] in { (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>; } +// Patterns +let Predicates = [UseAVX] in { + let AddedComplexity = 15 in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVS{S,D} to the lower bits. 
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), + (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), + (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>; + } + + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)), + sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)), + sub_xmm)>; + + // Move low f64 and clear high bits. + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>; + + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>; +} + +let Predicates = [UseSSE41] in { + // With SSE41 we can use blends for these patterns. 
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (BLENDPDrri (v2f64 (V_SET0)), VR128:$src, (i8 1))>; +} + + /// SS41I_ternary_int - SSE 4.1 ternary operator let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, @@ -7555,7 +7446,7 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))], - itins.rr>; + itins.rr>, Sched<[itins.Sched]>; def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), @@ -7564,18 +7455,21 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { [(set VR128:$dst, (IntId VR128:$src1, (bitconvert (mem_frag addr:$src2)), XMM0))], - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let ExeDomain = SSEPackedDouble in defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, - int_x86_sse41_blendvpd>; + int_x86_sse41_blendvpd, + DEFAULT_ITINS_FBLENDSCHED>; let ExeDomain = SSEPackedSingle in defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, - int_x86_sse41_blendvps>; + int_x86_sse41_blendvps, + DEFAULT_ITINS_FBLENDSCHED>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, - int_x86_sse41_pblendvb>; + int_x86_sse41_pblendvb, + DEFAULT_ITINS_VARBLENDSCHED>; // Aliases with the implicit xmm0 argument def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}", @@ -7704,7 +7598,7 @@ multiclass pcmpistrm_SS42AI<string asm> { []>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>; } -let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in { +let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPISTRM128 : 
pcmpistrm_SS42AI<"vpcmpistrm">, VEX; defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ; @@ -7739,7 +7633,7 @@ multiclass SS42AI_pcmpestrm<string asm> { []>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>; } -let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">; @@ -7774,7 +7668,7 @@ multiclass SS42AI_pcmpistri<string asm> { []>, Sched<[WritePCmpIStrILd, ReadAfterLd]>; } -let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in { +let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; @@ -7810,7 +7704,7 @@ multiclass SS42AI_pcmpestri<string asm> { []>, Sched<[WritePCmpEStrILd, ReadAfterLd]>; } -let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in { +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { let Predicates = [HasAVX] in defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; @@ -8189,7 +8083,7 @@ def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values // -let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -8221,6 +8115,49 @@ def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2), (INSERT_get_vinsert128_imm VR256:$ins))>; } +// Combine two consecutive 16-byte loads with a common destination register into +// one 32-byte load to that 
register. +let Predicates = [HasAVX, HasFastMem32] in { + def : Pat<(insert_subvector + (v8f32 (insert_subvector undef, (loadv4f32 addr:$src), (iPTR 0))), + (loadv4f32 (add addr:$src, (iPTR 16))), + (iPTR 4)), + (VMOVUPSYrm addr:$src)>; + + def : Pat<(insert_subvector + (v4f64 (insert_subvector undef, (loadv2f64 addr:$src), (iPTR 0))), + (loadv2f64 (add addr:$src, (iPTR 16))), + (iPTR 2)), + (VMOVUPDYrm addr:$src)>; + + def : Pat<(insert_subvector + (v32i8 (insert_subvector + undef, (bc_v16i8 (loadv2i64 addr:$src)), (iPTR 0))), + (bc_v16i8 (loadv2i64 (add addr:$src, (iPTR 16)))), + (iPTR 16)), + (VMOVDQUYrm addr:$src)>; + + def : Pat<(insert_subvector + (v16i16 (insert_subvector + undef, (bc_v8i16 (loadv2i64 addr:$src)), (iPTR 0))), + (bc_v8i16 (loadv2i64 (add addr:$src, (iPTR 16)))), + (iPTR 8)), + (VMOVDQUYrm addr:$src)>; + + def : Pat<(insert_subvector + (v8i32 (insert_subvector + undef, (bc_v4i32 (loadv2i64 addr:$src)), (iPTR 0))), + (bc_v4i32 (loadv2i64 (add addr:$src, (iPTR 16)))), + (iPTR 4)), + (VMOVDQUYrm addr:$src)>; + + def : Pat<(insert_subvector + (v4i64 (insert_subvector undef, (loadv2i64 addr:$src), (iPTR 0))), + (loadv2i64 (add addr:$src, (iPTR 16))), + (iPTR 2)), + (VMOVDQUYrm addr:$src)>; +} + let Predicates = [HasAVX1Only] in { def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), @@ -8263,7 +8200,7 @@ def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), //===----------------------------------------------------------------------===// // VEXTRACTF128 - Extract packed floating-point values // -let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in { +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), (ins VR256:$src1, i8imm:$src2), "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -8393,13 +8330,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), 
(ins RC:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (vt (X86VPermilp RC:$src1, (i8 imm:$src2))))]>, VEX, + [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (vt (X86VPermilp (memop addr:$src1), (i8 imm:$src2))))]>, VEX, + (vt (X86VPermilpi (memop addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; } @@ -8417,19 +8354,37 @@ let ExeDomain = SSEPackedDouble in { } let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), + (VPERMILPSYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VPERMILPSYrm VR256:$src1, addr:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))), + (VPERMILPDYrr VR256:$src1, VR256:$src2)>; +def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))), + (VPERMILPDYrm VR256:$src1, addr:$src2)>; + +def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))), (VPERMILPDYri VR256:$src1, imm:$imm)>; -def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)), +def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)), (i8 imm:$imm))), (VPERMILPSYmi addr:$src1, imm:$imm)>; -def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDYmi addr:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))), +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))), + (VPERMILPSrr 
VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))), + (VPERMILPSrm VR128:$src1, addr:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))), + (VPERMILPDrr VR128:$src1, VR128:$src2)>; +def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))), + (VPERMILPDrm VR128:$src1, addr:$src2)>; + +def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))), (VPERMILPDri VR128:$src1, imm:$imm)>; -def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))), +def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))), (VPERMILPDmi addr:$src1, imm:$imm)>; } @@ -8505,7 +8460,7 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (Int VR128:$src))]>, T8PD, VEX, Sched<[WriteCvtF2F]>; - let neverHasSideEffects = 1, mayLoad = 1 in + let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, Sched<[WriteCvtF2FLd]>; @@ -8517,7 +8472,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; - let neverHasSideEffects = 1, mayStore = 1, + let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteCvtF2FLd, WriteRMW] in def mr : Ii8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src1, i32i8imm:$src2), @@ -8563,13 +8518,13 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, X86MemOperand x86memop> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u32u8imm:$src3), + (ins RC:$src1, RC:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, Sched<[WriteBlend]>, VEX_4V; def rmi : 
AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + (ins RC:$src1, x86memop:$src2, i8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, @@ -8578,12 +8533,10 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; } -let isCommutable = 0 in { defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, VR128, loadv2i64, i128mem>; defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, VR256, loadv4i64, i256mem>, VEX_L; -} def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$mask)), @@ -8675,6 +8628,27 @@ let Predicates = [HasAVX2] in { def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), (VBROADCASTSDYrr VR128:$src)>; + // Provide aliases for broadcast from the same regitser class that + // automatically does the extract. + def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), + (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), + sub_xmm)))>; + def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), + (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), + sub_xmm)))>; + def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), + (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), + (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), + sub_xmm)))>; + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; + // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. 
let AddedComplexity = 20 in { @@ -8756,6 +8730,9 @@ let Predicates = [HasAVX] in { (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm), (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>; } + + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; } //===----------------------------------------------------------------------===// @@ -8763,14 +8740,14 @@ let Predicates = [HasAVX] in { // multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, - Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; + Sched<[Sched]>, VEX_4V, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, @@ -8778,22 +8755,22 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (bitconvert (mem_frag addr:$src2)))))]>, - Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; } -defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>; +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>; let ExeDomain = SSEPackedSingle in -defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>; +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, - ValueType OpVT> { + ValueType OpVT, X86FoldableSchedWrite Sched> { def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, - Sched<[WriteShuffle256]>, 
VEX, VEX_L; + Sched<[Sched]>, VEX, VEX_L; def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2), !strconcat(OpcodeStr, @@ -8801,12 +8778,14 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, [(set VR256:$dst, (OpVT (X86VPermi (mem_frag addr:$src1), (i8 imm:$src2))))]>, - Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX, VEX_L; + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; } -defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W; +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256>, VEX_W; let ExeDomain = SSEPackedDouble in -defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W; +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256>, VEX_W; //===----------------------------------------------------------------------===// // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks @@ -8847,7 +8826,7 @@ def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), //===----------------------------------------------------------------------===// // VINSERTI128 - Insert packed integer values // -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, i8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", @@ -8907,7 +8886,7 @@ def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), [(set VR128:$dst, (int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; -let neverHasSideEffects = 1, mayStore = 1 in +let hasSideEffects = 0, mayStore = 1 in def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), (ins i128mem:$dst, VR256:$src1, i8imm:$src2), "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, @@ -8985,6 +8964,115 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, 
VEX_W; +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), + (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), + (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), + (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), + (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), + (bc_v8f32 (v8i32 immAllZerosV)))), + (VMASKMOVPSYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VMASKMOVPSYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 immAllZerosV))), + (VPMASKMOVDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v8i32 (masked_load addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src0))), + (VBLENDVPSYrr VR256:$src0, (VPMASKMOVDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), + (bc_v4f32 (v4i32 immAllZerosV)))), + (VMASKMOVPSrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4f32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VMASKMOVPSrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), undef)), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), 
(v4i32 immAllZerosV))), + (VPMASKMOVDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0))), + (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), + (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), + (VPMASKMOVQYmr addr:$ptr, VR256:$mask, VR256:$src)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (v4f64 immAllZerosV))), + (VMASKMOVPDYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VMASKMOVPDYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), + (bc_v4i64 (v8i32 immAllZerosV)))), + (VPMASKMOVQYrm VR256:$mask, addr:$ptr)>; + +def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0))), + (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), + VR256:$mask)>; + +def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), + (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), + (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (v2f64 immAllZerosV))), + (VMASKMOVPDrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, 
(VMASKMOVPDrm VR128:$mask, addr:$ptr), + VR128:$mask)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), + (bc_v2i64 (v4i32 immAllZerosV)))), + (VPMASKMOVQrm VR128:$mask, addr:$ptr)>; + +def: Pat<(v2i64 (masked_load addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src0))), + (VBLENDVPDrr VR128:$src0, (VPMASKMOVQrm VR128:$mask, addr:$ptr), + VR128:$mask)>; //===----------------------------------------------------------------------===// // Variable Bit Shifts diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index d0bb5231a2d0..c706d43c9f5c 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -49,6 +49,7 @@ def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; +} // isConvertibleToThreeAddress = 1 // NOTE: We don't include patterns for shifts of a register by one, because // 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). 
@@ -62,7 +63,6 @@ def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), "shl{q}\t$dst", [], IIC_SR>; } // hasSideEffects = 0 -} // isConvertibleToThreeAddress = 1 } // Constraints = "$src = $dst", SchedRW @@ -289,11 +289,11 @@ def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), "sar{w}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize16; -def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), "sar{l}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize32; -def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), "sar{q}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; @@ -347,7 +347,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; - + def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), @@ -381,7 +381,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; - + def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), @@ -397,7 +397,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; - + def RCR64r1 : RI<0xD1, MRM3r, (outs 
GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), @@ -493,7 +493,7 @@ def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], @@ -600,7 +600,7 @@ def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], @@ -635,11 +635,11 @@ def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), "ror{w}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize16; -def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), "ror{l}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>, OpSize32; -def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), +def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; @@ -688,19 +688,19 @@ def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { let Uses = [CL] in { -def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shld 
GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize16; -def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))], IIC_SHD16_REG_CL>, TB, OpSize16; -def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))], @@ -710,58 +710,58 @@ def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))], IIC_SHD32_REG_CL>, TB, OpSize32; -def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))], - IIC_SHD64_REG_CL>, + IIC_SHD64_REG_CL>, TB; -def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))], - IIC_SHD64_REG_CL>, + IIC_SHD64_REG_CL>, TB; } let isCommutable = 1 in { // These instructions commute to each other. 
def SHLD16rri8 : Ii8<0xA4, MRMDestReg, - (outs GR16:$dst), + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, - (outs GR16:$dst), + (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, - (outs GR32:$dst), + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, - (outs GR32:$dst), + (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, - (outs GR64:$dst), + (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, - (outs GR64:$dst), + (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, @@ -789,7 +789,7 @@ def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)], IIC_SHD32_MEM_CL>, TB, OpSize32; - + def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", [(store (X86shld (loadi64 
addr:$dst), GR64:$src2, CL), @@ -807,7 +807,7 @@ def SHLD16mri8 : Ii8<0xA4, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD16_MEM_IM>, TB, OpSize16; -def SHRD16mri8 : Ii8<0xAC, MRMDestMem, +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, @@ -822,7 +822,7 @@ def SHLD32mri8 : Ii8<0xA4, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD32_MEM_IM>, TB, OpSize32; -def SHRD32mri8 : Ii8<0xAC, MRMDestMem, +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, @@ -837,7 +837,7 @@ def SHLD64mri8 : RIi8<0xA4, MRMDestMem, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; -def SHRD64mri8 : RIi8<0xAC, MRMDestMem, +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, @@ -859,7 +859,7 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{ }]>; multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShift]>; @@ -872,7 +872,7 @@ let neverHasSideEffects = 1 in { } multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { -let neverHasSideEffects = 1 in { +let hasSideEffects = 0 in { def rr : I<0xF7, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, VEX_4VOp3, Sched<[WriteShift]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 540278021f02..848eeeb07c82 100644 --- 
a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -207,7 +207,7 @@ def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), let SchedRW = [WriteSystem] in { def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", [], IIC_SWAPGS>, TB; -def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lar{w}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize16; def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), @@ -215,14 +215,14 @@ def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), OpSize16; // i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. -def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB, OpSize32; def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lar{l}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB, OpSize32; // i16mem operand in LAR64rm and GR32 operand in LAR32rr is not a typo. 
-def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), +def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RM>, TB; def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), "lar{q}\t{$src, $dst|$dst, $src}", [], IIC_LAR_RR>, TB; @@ -240,7 +240,7 @@ def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lsl{l}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB, OpSize32; def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; + "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RM>, TB; def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "lsl{q}\t{$src, $dst|$dst, $src}", [], IIC_LSL_RR>, TB; @@ -260,7 +260,7 @@ def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", [], IIC_LTR>, TB; - + def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, OpSize16, Requires<[Not64BitMode]>; @@ -347,31 +347,31 @@ def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lds{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lds{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; - + def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lss{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lss{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), "lss{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; - + def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "les{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize16; def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), 
(ins opaque48mem:$src), "les{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, OpSize32; - + def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lfs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lfs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), "lfs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; - + def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src), "lgs{w}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize16; def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src), "lgs{l}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB, OpSize32; - + def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src), "lgs{q}\t{$src, $dst|$dst, $src}", [], IIC_LXS>, TB; @@ -408,7 +408,7 @@ def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins), "sldt{w}\t$dst", [], IIC_SLDT>, TB; def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), "sldt{l}\t$dst", [], IIC_SLDT>, OpSize32, TB; - + // LLDT is not interpreted specially in 64-bit mode because there is no sign // extension. 
def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), @@ -444,12 +444,12 @@ let Defs = [RAX, RDX], Uses = [ECX] in def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)], IIC_RDPMC>, TB; -def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), "smsw{w}\t$dst", [], IIC_SMSW>, OpSize16, TB; -def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), "smsw{l}\t$dst", [], IIC_SMSW>, OpSize32, TB; // no m form encodable; use SMSW16m -def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), "smsw{q}\t$dst", [], IIC_SMSW>, TB; // For memory operands, there is only a 16-bit form @@ -462,11 +462,7 @@ def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), "lmsw{w}\t$src", [], IIC_LMSW_REG>, TB; let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in - def CPUID32 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB, - Requires<[Not64BitMode]>; -let Defs = [RAX, RBX, RCX, RDX], Uses = [RAX, RCX] in - def CPUID64 : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB, - Requires<[In64BitMode]>; + def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", [], IIC_CPUID>, TB; } // SchedRW //===----------------------------------------------------------------------===// @@ -479,10 +475,10 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; //===----------------------------------------------------------------------===// // XSAVE instructions let SchedRW = [WriteSystem] in { -let Defs = [RDX, RAX], Uses = [RCX] in +let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; -let Uses = [RDX, RAX, RCX] in +let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; let Uses = [RDX, RAX] in { @@ -563,7 +559,7 @@ def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), 
//===----------------------------------------------------------------------===// // SMAP Instruction -let Defs = [EFLAGS], Uses = [EFLAGS] in { +let Predicates = [HasSMAP], Defs = [EFLAGS] in { def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm/lib/Target/X86/X86InstrTSX.td index 4940efc4443d..7267d752653e 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrTSX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrTSX.td @@ -23,9 +23,12 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, Requires<[HasRTM]>; -let isBranch = 1, isTerminator = 1, Defs = [EAX] in -def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst), - "xbegin\t$dst", []>, Requires<[HasRTM]>; +let isBranch = 1, isTerminator = 1, Defs = [EAX] in { +def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst), + "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>; +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), + "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>; +} def XEND : I<0x01, MRM_D5, (outs), (ins), "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h new file mode 100644 index 000000000000..7130ae2c3097 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -0,0 +1,547 @@ +//===-- X86IntinsicsInfo.h - X86 Instrinsics ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file contains the details for lowering X86 intrinsics +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H +#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H + +namespace llvm { + +enum IntrinsicType { + INTR_NO_TYPE, + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, + CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, + INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK, INTR_TYPE_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND +}; + +struct IntrinsicData { + + unsigned Id; + IntrinsicType Type; + unsigned Opc0; + unsigned Opc1; + + bool operator<(const IntrinsicData &RHS) const { + return Id < RHS.Id; + } + bool operator==(const IntrinsicData &RHS) const { + return RHS.Id == Id; + } +}; + +#define X86_INTRINSIC_DATA(id, type, op0, op1) \ + { Intrinsic::x86_##id, type, op0, op1 } + +/* + * IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
+ */ +static const IntrinsicData IntrinsicsWithChain[] = { + X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), + + X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0), + X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0), + + X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH, + X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH, + X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm), + X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH, + X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm), + X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, + X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0), + 
X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), + X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), + + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, + X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, + X86::VSCATTERPF0DPSm, X86::VSCATTERPF1DPSm), + X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, + X86::VSCATTERPF0QPDm, X86::VSCATTERPF1QPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, + X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm), + + X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), + X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0), + X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0), + X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0), + X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0), + + X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0), + X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0), +}; + +/* + * Find Intrinsic data by intrinsic ID + */ +static const IntrinsicData* getIntrinsicWithChain(unsigned IntNo) { + + IntrinsicData IntrinsicToFind = {IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +/* + * IntrinsicsWithoutChain - the table should be sorted by Intrinsic ID - in + * the alphabetical order. 
+ */ +static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_permd, INTR_TYPE_2OP, X86ISD::VPERMV, 0), + X86_INTRINSIC_DATA(avx2_permps, INTR_TYPE_2OP, X86ISD::VPERMV, 0), + X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_b, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_d, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_d, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmaxu_w, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(avx2_pmins_b, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_d, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_d, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pminu_w, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + 
X86_INTRINSIC_DATA(avx2_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psign_w, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, 
ISD::SRA, 0), + X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_b_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_d_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_pd_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_ps_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_128, BLEND, X86ISD::SELECT, 0), + 
X86_INTRINSIC_DATA(avx512_mask_blend_q_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_q_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, 
COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + + X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_d_512, 
CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_q_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_128, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_256, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpeq_w_512, CMP_MASK, X86ISD::PCMPEQM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_b_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_d_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_q_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_pslli_q, VSHIFT_MASK, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_d, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psllv_q, INTR_TYPE_2OP_MASK, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_mask_psra_d, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + 
X86_INTRINSIC_DATA(avx512_mask_psra_q, INTR_TYPE_2OP_MASK, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_d, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrai_q, VSHIFT_MASK, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_d, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrlv_q, INTR_TYPE_2OP_MASK, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0), 
+ X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, 
X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB , 0), + X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), + X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), 
+ X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), + X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), + X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), + X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), + X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), + X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0), + X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), + X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), + X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0), + X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 
0), + X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), + X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), + X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), + X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, X86ISD::SMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, X86ISD::UMAX, 0), + X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, X86ISD::SMIN, 0), + X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, X86ISD::UMIN, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + 
X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0), + X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0), + X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0), + X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ), + X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE), + X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT), + X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE), + X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT), + X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), + X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), + X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(ssse3_psign_b_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0) +}; + +/* + * 
Retrieve data for Intrinsic without chain. + * Return nullptr if intrinsic is not defined in the table. + */ +static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) { + IntrinsicData IntrinsicToFind = { IntNo, INTR_NO_TYPE, 0, 0 }; + const IntrinsicData *Data = std::lower_bound(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain), + IntrinsicToFind); + if (Data != std::end(IntrinsicsWithoutChain) && *Data == IntrinsicToFind) + return Data; + return nullptr; +} + +static void verifyIntrinsicTables() { + assert(std::is_sorted(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) && + std::is_sorted(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should be sorted by Intrinsic ID"); +} + +} // End llvm namespace + +#endif diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp b/contrib/llvm/lib/Target/X86/X86JITInfo.cpp deleted file mode 100644 index a082c4f8b0ef..000000000000 --- a/contrib/llvm/lib/Target/X86/X86JITInfo.cpp +++ /dev/null @@ -1,588 +0,0 @@ -//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the JIT interfaces for the X86 target. 
-// -//===----------------------------------------------------------------------===// - -#include "X86JITInfo.h" -#include "X86Relocations.h" -#include "X86Subtarget.h" -#include "X86TargetMachine.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Valgrind.h" -#include <cstdlib> -#include <cstring> -using namespace llvm; - -#define DEBUG_TYPE "jit" - -// Determine the platform we're running on -#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64) -# define X86_64_JIT -#elif defined(__i386__) || defined(i386) || defined(_M_IX86) -# define X86_32_JIT -#endif - -void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) { - unsigned char *OldByte = (unsigned char *)Old; - *OldByte++ = 0xE9; // Emit JMP opcode. - unsigned *OldWord = (unsigned *)OldByte; - unsigned NewAddr = (intptr_t)New; - unsigned OldAddr = (intptr_t)OldWord; - *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code. - - // X86 doesn't need to invalidate the processor cache, so just invalidate - // Valgrind's cache directly. - sys::ValgrindDiscardTranslations(Old, 5); -} - - -/// JITCompilerFunction - This contains the address of the JIT function used to -/// compile a function lazily. -static TargetJITInfo::JITCompilerFn JITCompilerFunction; - -// Get the ASMPREFIX for the current host. This is often '_'. -#ifndef __USER_LABEL_PREFIX__ -#define __USER_LABEL_PREFIX__ -#endif -#define GETASMPREFIX2(X) #X -#define GETASMPREFIX(X) GETASMPREFIX2(X) -#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) - -// For ELF targets, use a .size and .type directive, to let tools -// know the extent of functions defined in assembler. -#if defined(__ELF__) -# define SIZE(sym) ".size " #sym ", . 
- " #sym "\n" -# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n" -#else -# define SIZE(sym) -# define TYPE_FUNCTION(sym) -#endif - -// Provide a convenient way for disabling usage of CFI directives. -// This is needed for old/broken assemblers (for example, gas on -// Darwin is pretty old and doesn't support these directives) -#if defined(__APPLE__) -# define CFI(x) -#else -// FIXME: Disable this until we really want to use it. Also, we will -// need to add some workarounds for compilers, which support -// only subset of these directives. -# define CFI(x) -#endif - -// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional -// callee saved registers, for the fastcc calling convention. -extern "C" { -#if defined(X86_64_JIT) -# ifndef _MSC_VER - // No need to save EAX/EDX for X86-64. - void X86CompilationCallback(void); - asm( - ".text\n" - ".align 8\n" - ".globl " ASMPREFIX "X86CompilationCallback\n" - TYPE_FUNCTION(X86CompilationCallback) - ASMPREFIX "X86CompilationCallback:\n" - CFI(".cfi_startproc\n") - // Save RBP - "pushq %rbp\n" - CFI(".cfi_def_cfa_offset 16\n") - CFI(".cfi_offset %rbp, -16\n") - // Save RSP - "movq %rsp, %rbp\n" - CFI(".cfi_def_cfa_register %rbp\n") - // Save all int arg registers - "pushq %rdi\n" - CFI(".cfi_rel_offset %rdi, 0\n") - "pushq %rsi\n" - CFI(".cfi_rel_offset %rsi, 8\n") - "pushq %rdx\n" - CFI(".cfi_rel_offset %rdx, 16\n") - "pushq %rcx\n" - CFI(".cfi_rel_offset %rcx, 24\n") - "pushq %r8\n" - CFI(".cfi_rel_offset %r8, 32\n") - "pushq %r9\n" - CFI(".cfi_rel_offset %r9, 40\n") - // Align stack on 16-byte boundary. ESP might not be properly aligned - // (8 byte) if this is called from an indirect stub. 
- "andq $-16, %rsp\n" - // Save all XMM arg registers - "subq $128, %rsp\n" - "movaps %xmm0, (%rsp)\n" - "movaps %xmm1, 16(%rsp)\n" - "movaps %xmm2, 32(%rsp)\n" - "movaps %xmm3, 48(%rsp)\n" - "movaps %xmm4, 64(%rsp)\n" - "movaps %xmm5, 80(%rsp)\n" - "movaps %xmm6, 96(%rsp)\n" - "movaps %xmm7, 112(%rsp)\n" - // JIT callee -#if defined(_WIN64) || defined(__CYGWIN__) - "subq $32, %rsp\n" - "movq %rbp, %rcx\n" // Pass prev frame and return address - "movq 8(%rbp), %rdx\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" - "addq $32, %rsp\n" -#else - "movq %rbp, %rdi\n" // Pass prev frame and return address - "movq 8(%rbp), %rsi\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" -#endif - // Restore all XMM arg registers - "movaps 112(%rsp), %xmm7\n" - "movaps 96(%rsp), %xmm6\n" - "movaps 80(%rsp), %xmm5\n" - "movaps 64(%rsp), %xmm4\n" - "movaps 48(%rsp), %xmm3\n" - "movaps 32(%rsp), %xmm2\n" - "movaps 16(%rsp), %xmm1\n" - "movaps (%rsp), %xmm0\n" - // Restore RSP - "movq %rbp, %rsp\n" - CFI(".cfi_def_cfa_register %rsp\n") - // Restore all int arg registers - "subq $48, %rsp\n" - CFI(".cfi_adjust_cfa_offset 48\n") - "popq %r9\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %r9\n") - "popq %r8\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %r8\n") - "popq %rcx\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rcx\n") - "popq %rdx\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rdx\n") - "popq %rsi\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rsi\n") - "popq %rdi\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rdi\n") - // Restore RBP - "popq %rbp\n" - CFI(".cfi_adjust_cfa_offset -8\n") - CFI(".cfi_restore %rbp\n") - "ret\n" - CFI(".cfi_endproc\n") - SIZE(X86CompilationCallback) - ); -# else - // No inline assembler support on this platform. The routine is in external - // file. 
- void X86CompilationCallback(); - -# endif -#elif defined (X86_32_JIT) -# ifndef _MSC_VER - void X86CompilationCallback(void); - asm( - ".text\n" - ".align 8\n" - ".globl " ASMPREFIX "X86CompilationCallback\n" - TYPE_FUNCTION(X86CompilationCallback) - ASMPREFIX "X86CompilationCallback:\n" - CFI(".cfi_startproc\n") - "pushl %ebp\n" - CFI(".cfi_def_cfa_offset 8\n") - CFI(".cfi_offset %ebp, -8\n") - "movl %esp, %ebp\n" // Standard prologue - CFI(".cfi_def_cfa_register %ebp\n") - "pushl %eax\n" - CFI(".cfi_rel_offset %eax, 0\n") - "pushl %edx\n" // Save EAX/EDX/ECX - CFI(".cfi_rel_offset %edx, 4\n") - "pushl %ecx\n" - CFI(".cfi_rel_offset %ecx, 8\n") -# if defined(__APPLE__) - "andl $-16, %esp\n" // Align ESP on 16-byte boundary -# endif - "subl $16, %esp\n" - "movl 4(%ebp), %eax\n" // Pass prev frame and return address - "movl %eax, 4(%esp)\n" - "movl %ebp, (%esp)\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" - "movl %ebp, %esp\n" // Restore ESP - CFI(".cfi_def_cfa_register %esp\n") - "subl $12, %esp\n" - CFI(".cfi_adjust_cfa_offset 12\n") - "popl %ecx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ecx\n") - "popl %edx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %edx\n") - "popl %eax\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %eax\n") - "popl %ebp\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ebp\n") - "ret\n" - CFI(".cfi_endproc\n") - SIZE(X86CompilationCallback) - ); - - // Same as X86CompilationCallback but also saves XMM argument registers. 
- void X86CompilationCallback_SSE(void); - asm( - ".text\n" - ".align 8\n" - ".globl " ASMPREFIX "X86CompilationCallback_SSE\n" - TYPE_FUNCTION(X86CompilationCallback_SSE) - ASMPREFIX "X86CompilationCallback_SSE:\n" - CFI(".cfi_startproc\n") - "pushl %ebp\n" - CFI(".cfi_def_cfa_offset 8\n") - CFI(".cfi_offset %ebp, -8\n") - "movl %esp, %ebp\n" // Standard prologue - CFI(".cfi_def_cfa_register %ebp\n") - "pushl %eax\n" - CFI(".cfi_rel_offset %eax, 0\n") - "pushl %edx\n" // Save EAX/EDX/ECX - CFI(".cfi_rel_offset %edx, 4\n") - "pushl %ecx\n" - CFI(".cfi_rel_offset %ecx, 8\n") - "andl $-16, %esp\n" // Align ESP on 16-byte boundary - // Save all XMM arg registers - "subl $64, %esp\n" - // FIXME: provide frame move information for xmm registers. - // This can be tricky, because CFA register is ebp (unaligned) - // and we need to produce offsets relative to it. - "movaps %xmm0, (%esp)\n" - "movaps %xmm1, 16(%esp)\n" - "movaps %xmm2, 32(%esp)\n" - "movaps %xmm3, 48(%esp)\n" - "subl $16, %esp\n" - "movl 4(%ebp), %eax\n" // Pass prev frame and return address - "movl %eax, 4(%esp)\n" - "movl %ebp, (%esp)\n" - "call " ASMPREFIX "LLVMX86CompilationCallback2\n" - "addl $16, %esp\n" - "movaps 48(%esp), %xmm3\n" - CFI(".cfi_restore %xmm3\n") - "movaps 32(%esp), %xmm2\n" - CFI(".cfi_restore %xmm2\n") - "movaps 16(%esp), %xmm1\n" - CFI(".cfi_restore %xmm1\n") - "movaps (%esp), %xmm0\n" - CFI(".cfi_restore %xmm0\n") - "movl %ebp, %esp\n" // Restore ESP - CFI(".cfi_def_cfa_register esp\n") - "subl $12, %esp\n" - CFI(".cfi_adjust_cfa_offset 12\n") - "popl %ecx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ecx\n") - "popl %edx\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %edx\n") - "popl %eax\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %eax\n") - "popl %ebp\n" - CFI(".cfi_adjust_cfa_offset -4\n") - CFI(".cfi_restore %ebp\n") - "ret\n" - CFI(".cfi_endproc\n") - SIZE(X86CompilationCallback_SSE) - ); -# else - void 
LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr); - - _declspec(naked) void X86CompilationCallback(void) { - __asm { - push ebp - mov ebp, esp - push eax - push edx - push ecx - and esp, -16 - sub esp, 16 - mov eax, dword ptr [ebp+4] - mov dword ptr [esp+4], eax - mov dword ptr [esp], ebp - call LLVMX86CompilationCallback2 - mov esp, ebp - sub esp, 12 - pop ecx - pop edx - pop eax - pop ebp - ret - } - } - -# endif // _MSC_VER - -#else // Not an i386 host - void X86CompilationCallback() { - llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!"); - } -#endif -} - -/// This is the target-specific function invoked by the -/// function stub when we did not know the real target of a call. This function -/// must locate the start of the stub or call site and pass it into the JIT -/// compiler function. -extern "C" { -LLVM_ATTRIBUTE_USED // Referenced from inline asm. -LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr, - intptr_t RetAddr) { - intptr_t *RetAddrLoc = &StackPtr[1]; - // We are reading raw stack data here. Tell MemorySanitizer that it is - // sufficiently initialized. - __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc)); - assert(*RetAddrLoc == RetAddr && - "Could not find return address on the stack!"); - - // It's a stub if there is an interrupt marker after the call. - bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE; - - // The call instruction should have pushed the return value onto the stack... -#if defined (X86_64_JIT) - RetAddr--; // Backtrack to the reference itself... -#else - RetAddr -= 4; // Backtrack to the reference itself... -#endif - -#if 0 - DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr - << " ESP=" << (void*)StackPtr - << ": Resolving call to function: " - << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n"); -#endif - - // Sanity check to make sure this really is a call instruction. 
-#if defined (X86_64_JIT) - assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!"); - assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!"); -#else - assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!"); -#endif - - intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr); - - // Rewrite the call target... so that we don't end up here every time we - // execute the call. -#if defined (X86_64_JIT) - assert(isStub && - "X86-64 doesn't support rewriting non-stub lazy compilation calls:" - " the call instruction varies too much."); -#else - *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4); -#endif - - if (isStub) { - // If this is a stub, rewrite the call into an unconditional branch - // instruction so that two return addresses are not pushed onto the stack - // when the requested function finally gets called. This also makes the - // 0xCE byte (interrupt) dead, so the marker doesn't effect anything. -#if defined (X86_64_JIT) - // If the target address is within 32-bit range of the stub, use a - // PC-relative branch instead of loading the actual address. (This is - // considerably shorter than the 64-bit immediate load already there.) - // We assume here intptr_t is 64 bits. - intptr_t diff = NewVal-RetAddr+7; - if (diff >= -2147483648LL && diff <= 2147483647LL) { - *(unsigned char*)(RetAddr-0xc) = 0xE9; - *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff; - } else { - *(intptr_t *)(RetAddr - 0xa) = NewVal; - ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6)); - } - sys::ValgrindDiscardTranslations((void*)(RetAddr-0xc), 0xd); -#else - ((unsigned char*)RetAddr)[-1] = 0xE9; - sys::ValgrindDiscardTranslations((void*)(RetAddr-1), 5); -#endif - } - - // Change the return address to reexecute the call instruction... 
-#if defined (X86_64_JIT) - *RetAddrLoc -= 0xd; -#else - *RetAddrLoc -= 5; -#endif -} -} - -TargetJITInfo::LazyResolverFn -X86JITInfo::getLazyResolverFunction(JITCompilerFn F) { - TsanIgnoreWritesBegin(); - JITCompilerFunction = F; - TsanIgnoreWritesEnd(); - -#if defined (X86_32_JIT) && !defined (_MSC_VER) -#if defined(__SSE__) - // SSE Callback should be called for SSE-enabled LLVM. - return X86CompilationCallback_SSE; -#else - if (useSSE) - return X86CompilationCallback_SSE; -#endif -#endif - - return X86CompilationCallback; -} - -X86JITInfo::X86JITInfo(bool UseSSE) { - useSSE = UseSSE; - useGOT = 0; - TLSOffset = nullptr; -} - -void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, - JITCodeEmitter &JCE) { -#if defined (X86_64_JIT) - const unsigned Alignment = 8; - uint8_t Buffer[8]; - uint8_t *Cur = Buffer; - MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr); - MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32)); -#else - const unsigned Alignment = 4; - uint8_t Buffer[4]; - uint8_t *Cur = Buffer; - MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr); -#endif - return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment); -} - -TargetJITInfo::StubLayout X86JITInfo::getStubLayout() { - // The 64-bit stub contains: - // movabs r10 <- 8-byte-target-address # 10 bytes - // call|jmp *r10 # 3 bytes - // The 32-bit stub contains a 5-byte call|jmp. - // If the stub is a call to the compilation callback, an extra byte is added - // to mark it as a stub. - StubLayout Result = {14, 4}; - return Result; -} - -void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, - JITCodeEmitter &JCE) { - // Note, we cast to intptr_t here to silence a -pedantic warning that - // complains about casting a function pointer to a normal pointer. 
-#if defined (X86_32_JIT) && !defined (_MSC_VER) - bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback && - Target != (void*)(intptr_t)X86CompilationCallback_SSE); -#else - bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback; -#endif - JCE.emitAlignment(4); - void *Result = (void*)JCE.getCurrentPCValue(); - if (NotCC) { -#if defined (X86_64_JIT) - JCE.emitByte(0x49); // REX prefix - JCE.emitByte(0xB8+2); // movabsq r10 - JCE.emitWordLE((unsigned)(intptr_t)Target); - JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); - JCE.emitByte(0x41); // REX prefix - JCE.emitByte(0xFF); // jmpq *r10 - JCE.emitByte(2 | (4 << 3) | (3 << 6)); -#else - JCE.emitByte(0xE9); - JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); -#endif - return Result; - } - -#if defined (X86_64_JIT) - JCE.emitByte(0x49); // REX prefix - JCE.emitByte(0xB8+2); // movabsq r10 - JCE.emitWordLE((unsigned)(intptr_t)Target); - JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32)); - JCE.emitByte(0x41); // REX prefix - JCE.emitByte(0xFF); // callq *r10 - JCE.emitByte(2 | (2 << 3) | (3 << 6)); -#else - JCE.emitByte(0xE8); // Call with 32 bit pc-rel destination... - - JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); -#endif - - // This used to use 0xCD, but that value is used by JITMemoryManager to - // initialize the buffer with garbage, which means it may follow a - // noreturn function call, confusing LLVMX86CompilationCallback2. PR 4929. - JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub! - return Result; -} - -/// getPICJumpTableEntry - Returns the value of the jumptable entry for the -/// specific basic block. 
-uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) { -#if defined(X86_64_JIT) - return BB - Entry; -#else - return BB - PICBase; -#endif -} - -template<typename T> static void addUnaligned(void *Pos, T Delta) { - T Value; - std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos), - sizeof(T)); - Value += Delta; - std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value), - sizeof(T)); -} - -/// relocate - Before the JIT can run a block of code that has been emitted, -/// it must rewrite the code to contain the actual addresses of any -/// referenced global symbols. -void X86JITInfo::relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) { - for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { - void *RelocPos = (char*)Function + MR->getMachineCodeOffset(); - intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); - switch ((X86::RelocationType)MR->getRelocationType()) { - case X86::reloc_pcrel_word: { - // PC relative relocation, add the relocated value to the value already in - // memory, after we adjust it for where the PC is. - ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal(); - addUnaligned<unsigned>(RelocPos, ResultPtr); - break; - } - case X86::reloc_picrel_word: { - // PIC base relative relocation, add the relocated value to the value - // already in memory, after we adjust it for where the PIC base is. - ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal()); - addUnaligned<unsigned>(RelocPos, ResultPtr); - break; - } - case X86::reloc_absolute_word: - case X86::reloc_absolute_word_sext: - // Absolute relocation, just add the relocated value to the value already - // in memory. 
- addUnaligned<unsigned>(RelocPos, ResultPtr); - break; - case X86::reloc_absolute_dword: - addUnaligned<intptr_t>(RelocPos, ResultPtr); - break; - } - } -} - -char* X86JITInfo::allocateThreadLocalMemory(size_t size) { -#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER) - TLSOffset -= size; - return TLSOffset; -#else - llvm_unreachable("Cannot allocate thread local storage on this arch!"); -#endif -} diff --git a/contrib/llvm/lib/Target/X86/X86JITInfo.h b/contrib/llvm/lib/Target/X86/X86JITInfo.h deleted file mode 100644 index 564343ffa3fa..000000000000 --- a/contrib/llvm/lib/Target/X86/X86JITInfo.h +++ /dev/null @@ -1,79 +0,0 @@ -//===-- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the X86 implementation of the TargetJITInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef X86JITINFO_H -#define X86JITINFO_H - -#include "llvm/CodeGen/JITCodeEmitter.h" -#include "llvm/IR/Function.h" -#include "llvm/Target/TargetJITInfo.h" - -namespace llvm { - class X86Subtarget; - - class X86JITInfo : public TargetJITInfo { - uintptr_t PICBase; - char *TLSOffset; - bool useSSE; - public: - explicit X86JITInfo(bool UseSSE); - - /// replaceMachineCodeForFunction - Make it so that calling the function - /// whose machine code is at OLD turns into a call to NEW, perhaps by - /// overwriting OLD with a branch to NEW. This is used for self-modifying - /// code. - /// - void replaceMachineCodeForFunction(void *Old, void *New) override; - - /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object - /// to emit an indirect symbol which contains the address of the specified - /// ptr. 
- void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr, - JITCodeEmitter &JCE) override; - - // getStubLayout - Returns the size and alignment of the largest call stub - // on X86. - StubLayout getStubLayout() override; - - /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a - /// small native function that simply calls the function at the specified - /// address. - void *emitFunctionStub(const Function* F, void *Target, - JITCodeEmitter &JCE) override; - - /// getPICJumpTableEntry - Returns the value of the jumptable entry for the - /// specific basic block. - uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase) override; - - /// getLazyResolverFunction - Expose the lazy resolver to the JIT. - LazyResolverFn getLazyResolverFunction(JITCompilerFn) override; - - /// relocate - Before the JIT can run a block of code that has been emitted, - /// it must rewrite the code to contain the actual addresses of any - /// referenced global symbols. - void relocate(void *Function, MachineRelocation *MR, - unsigned NumRelocs, unsigned char* GOTBase) override; - - /// allocateThreadLocalMemory - Each target has its own way of - /// handling thread local variables. This method returns a value only - /// meaningful to the target. - char* allocateThreadLocalMemory(size_t size) override; - - /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute - /// PIC jumptable entry. 
- void setPICBase(uintptr_t Base) { PICBase = Base; } - uintptr_t getPICBase() const { return PICBase; } - }; -} - -#endif diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 2bd70a96c432..63ccded5ac9f 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -16,20 +16,25 @@ #include "X86RegisterInfo.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "Utils/X86ShuffleDecode.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/TargetRegistry.h" using namespace llvm; namespace { @@ -58,6 +63,53 @@ private: } // end anonymous namespace +// Emit a minimal sequence of nops spanning NumBytes bytes. 
+static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, + const MCSubtargetInfo &STI); + +namespace llvm { + X86AsmPrinter::StackMapShadowTracker::StackMapShadowTracker(TargetMachine &TM) + : TM(TM), InShadow(false), RequiredShadowSize(0), CurrentShadowSize(0) {} + + X86AsmPrinter::StackMapShadowTracker::~StackMapShadowTracker() {} + + void + X86AsmPrinter::StackMapShadowTracker::startFunction(MachineFunction &MF) { + CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( + *TM.getSubtargetImpl()->getInstrInfo(), + *TM.getSubtargetImpl()->getRegisterInfo(), *TM.getSubtargetImpl(), + MF.getContext())); + } + + void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, + const MCSubtargetInfo &STI) { + if (InShadow) { + SmallString<256> Code; + SmallVector<MCFixup, 4> Fixups; + raw_svector_ostream VecOS(Code); + CodeEmitter->EncodeInstruction(Inst, VecOS, Fixups, STI); + VecOS.flush(); + CurrentShadowSize += Code.size(); + if (CurrentShadowSize >= RequiredShadowSize) + InShadow = false; // The shadow is big enough. Stop counting. + } + } + + void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( + MCStreamer &OutStreamer, const MCSubtargetInfo &STI) { + if (InShadow && CurrentShadowSize < RequiredShadowSize) { + InShadow = false; + EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, + TM.getSubtarget<X86Subtarget>().is64Bit(), STI); + } + } + + void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { + OutStreamer.EmitInstruction(Inst, getSubtargetInfo()); + SMShadowTracker.count(Inst, getSubtargetInfo()); + } +} // end llvm namespace + X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), @@ -72,7 +124,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. 
MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); SmallString<128> Name; @@ -212,7 +264,8 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(MF.getPICBaseSymbol(), Ctx), Ctx); - if (MO.isJTI() && MAI.hasSetDirective()) { + if (MO.isJTI()) { + assert(MAI.doesSetDirectiveSuppressesReloc()); // If .set directive is supported, use it to reduce the number of // relocations the assembler will generate for differences between // local labels. This is only safe when the symbols are in the same @@ -337,9 +390,8 @@ static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst, Inst.addOperand(Seg); } -static unsigned getRetOpcode(const X86Subtarget &Subtarget) -{ - return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; +static unsigned getRetOpcode(const X86Subtarget &Subtarget) { + return Subtarget.is64Bit() ? X86::RETQ : X86::RETL; } void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { @@ -493,6 +545,24 @@ ReSimplify: break; } + case X86::DEC16r: + case X86::DEC32r: + case X86::INC16r: + case X86::INC32r: + // If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions. + if (!AsmPrinter.getSubtarget().is64Bit()) { + unsigned Opcode; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::DEC16r: Opcode = X86::DEC16r_alt; break; + case X86::DEC32r: Opcode = X86::DEC32r_alt; break; + case X86::INC16r: Opcode = X86::INC16r_alt; break; + case X86::INC32r: Opcode = X86::INC32r_alt; break; + } + OutMI.setOpcode(Opcode); + } + break; + // These are pseudo-ops for OR to help with the OR->ADD transformation. We do // this with an ugly goto in case the resultant OR uses EAX and needs the // short form. 
@@ -506,39 +576,41 @@ ReSimplify: case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify; case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify; - // The assembler backend wants to see branches in their small form and relax - // them to their large form. The JIT can only handle the large form because - // it does not do relaxation. For now, translate the large form to the - // small one here. - case X86::JMP_4: OutMI.setOpcode(X86::JMP_1); break; - case X86::JO_4: OutMI.setOpcode(X86::JO_1); break; - case X86::JNO_4: OutMI.setOpcode(X86::JNO_1); break; - case X86::JB_4: OutMI.setOpcode(X86::JB_1); break; - case X86::JAE_4: OutMI.setOpcode(X86::JAE_1); break; - case X86::JE_4: OutMI.setOpcode(X86::JE_1); break; - case X86::JNE_4: OutMI.setOpcode(X86::JNE_1); break; - case X86::JBE_4: OutMI.setOpcode(X86::JBE_1); break; - case X86::JA_4: OutMI.setOpcode(X86::JA_1); break; - case X86::JS_4: OutMI.setOpcode(X86::JS_1); break; - case X86::JNS_4: OutMI.setOpcode(X86::JNS_1); break; - case X86::JP_4: OutMI.setOpcode(X86::JP_1); break; - case X86::JNP_4: OutMI.setOpcode(X86::JNP_1); break; - case X86::JL_4: OutMI.setOpcode(X86::JL_1); break; - case X86::JGE_4: OutMI.setOpcode(X86::JGE_1); break; - case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break; - case X86::JG_4: OutMI.setOpcode(X86::JG_1); break; - // Atomic load and store require a separate pseudo-inst because Acquire // implies mayStore and Release implies mayLoad; fix these to regular MOV // instructions here - case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; - case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; - case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; - case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; - case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; - case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; - case 
X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; - case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify; + case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify; + case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; + case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; + case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + 
case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; + case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; + case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; + case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify; + case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify; + case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify; + case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify; + case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify; // We don't currently select the correct instruction form for instructions // which have a short %eax, etc. form. Handle this by custom lowering, for @@ -548,13 +620,13 @@ ReSimplify: // MOV64ao8, MOV64o8a // XCHG16ar, XCHG32ar, XCHG64ar case X86::MOV8mr_NOREX: - case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao8); break; + case X86::MOV8mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o32a); break; case X86::MOV8rm_NOREX: - case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8o8a); break; - case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao16); break; - case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o16a); break; - case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; - case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + case X86::MOV8rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV8ao32); break; + case X86::MOV16mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16o32a); break; + case X86::MOV16rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV16ao32); break; + case X86::MOV32mr: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32o32a); break; + case X86::MOV32rm: SimplifyShortMoveForm(AsmPrinter, OutMI, X86::MOV32ao32); break; case X86::ADC8ri: SimplifyShortImmForm(OutMI, X86::ADC8i8); break; case X86::ADC16ri: 
SimplifyShortImmForm(OutMI, X86::ADC16i16); break; @@ -602,10 +674,8 @@ ReSimplify: } } -static void LowerTlsAddr(MCStreamer &OutStreamer, - X86MCInstLower &MCInstLowering, - const MachineInstr &MI, - const MCSubtargetInfo& STI) { +void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, + const MachineInstr &MI) { bool is64Bits = MI.getOpcode() == X86::TLS_addr64 || MI.getOpcode() == X86::TLS_base_addr64; @@ -615,7 +685,7 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, MCContext &context = OutStreamer.getContext(); if (needsPadding) - OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); MCSymbolRefExpr::VariantKind SRVK; switch (MI.getOpcode()) { @@ -662,12 +732,12 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, LEA.addOperand(MCOperand::CreateExpr(symRef)); // disp LEA.addOperand(MCOperand::CreateReg(0)); // seg } - OutStreamer.EmitInstruction(LEA, STI); + EmitAndCountInstruction(LEA); if (needsPadding) { - OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI); - OutStreamer.EmitInstruction(MCInstBuilder(X86::DATA16_PREFIX), STI); - OutStreamer.EmitInstruction(MCInstBuilder(X86::REX64_PREFIX), STI); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX)); + EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX)); } StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr"; @@ -677,9 +747,9 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, MCSymbolRefExpr::VK_PLT, context); - OutStreamer.EmitInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 - : X86::CALLpcrel32) - .addExpr(tlsRef), STI); + EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32 + : X86::CALLpcrel32) + .addExpr(tlsRef)); } /// \brief Emit the optimal amount of multi-byte nops on X86. 
@@ -725,33 +795,82 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, const MCSu break; case X86::NOOPL: case X86::NOOPW: - OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal) - .addReg(IndexReg) - .addImm(Displacement) - .addReg(SegmentReg), STI); + OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg) + .addImm(ScaleVal).addReg(IndexReg) + .addImm(Displacement).addReg(SegmentReg), STI); break; } } // while (NumBytes) } +static void LowerSTATEPOINT(MCStreamer &OS, StackMaps &SM, + const MachineInstr &MI, bool Is64Bit, + const TargetMachine& TM, + const MCSubtargetInfo& STI, + X86MCInstLower &MCInstLowering) { + assert(Is64Bit && "Statepoint currently only supports X86-64"); + + // Lower call target and choose correct opcode + const MachineOperand &call_target = StatepointOpers(&MI).getCallTarget(); + MCOperand call_target_mcop; + unsigned call_opcode; + switch (call_target.getType()) { + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + call_target_mcop = MCInstLowering.LowerSymbolOperand( + call_target, + MCInstLowering.GetSymbolFromOperand(call_target)); + call_opcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // address. You'll fail asserts during load & relocation if this + // symbol is to far away. (TODO: support non-relative addressing) + break; + case MachineOperand::MO_Immediate: + call_target_mcop = MCOperand::CreateImm(call_target.getImm()); + call_opcode = X86::CALL64pcrel32; + // Currently, we only support relative addressing with statepoints. + // Otherwise, we'll need a scratch register to hold the target + // immediate. You'll fail asserts during load & relocation if this + // address is to far away. 
(TODO: support non-relative addressing) + break; + case MachineOperand::MO_Register: + call_target_mcop = MCOperand::CreateReg(call_target.getReg()); + call_opcode = X86::CALL64r; + break; + default: + llvm_unreachable("Unsupported operand type in statepoint call target"); + break; + } + + // Emit call + MCInst call_inst; + call_inst.setOpcode(call_opcode); + call_inst.addOperand(call_target_mcop); + OS.EmitInstruction(call_inst, STI); + + // Record our statepoint node in the same section used by STACKMAP + // and PATCHPOINT + SM.recordStatepoint(MI); +} + + // Lower a stackmap of the form: // <id>, <shadowBytes>, ... -static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM, - const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) { - unsigned NumBytes = MI.getOperand(1).getImm(); +void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); SM.recordStackMap(MI); - // Emit padding. - // FIXME: These nops ensure that the stackmap's shadow is covered by - // instructions from the same basic block, but the nops should not be - // necessary if instructions from the same block follow the stackmap. - EmitNops(OS, NumBytes, Is64Bit, STI); + unsigned NumShadowBytes = MI.getOperand(1).getImm(); + SMShadowTracker.reset(NumShadowBytes); } // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... 
-static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, - const MachineInstr &MI, bool Is64Bit, const MCSubtargetInfo& STI) { - assert(Is64Bit && "Patchpoint currently only supports X86-64"); +void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI) { + assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64"); + + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + SM.recordPatchPoint(MI); PatchPointOpers opers(&MI); @@ -766,22 +885,111 @@ static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, EncodedBytes = 13; else EncodedBytes = 12; - OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg) - .addImm(CallTarget), STI); - OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg), STI); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg) + .addImm(CallTarget)); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } // Emit padding. unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); - EmitNops(OS, NumBytes - EncodedBytes, Is64Bit, STI); + EmitNops(OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), + getSubtargetInfo()); +} + +// Returns instruction preceding MBBI in MachineFunction. +// If MBBI is the first instruction of the first basic block, returns null. 
+static MachineBasicBlock::const_iterator +PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) { + const MachineBasicBlock *MBB = MBBI->getParent(); + while (MBBI == MBB->begin()) { + if (MBB == MBB->getParent()->begin()) + return nullptr; + MBB = MBB->getPrevNode(); + MBBI = MBB->end(); + } + return --MBBI; +} + +static const Constant *getConstantFromPool(const MachineInstr &MI, + const MachineOperand &Op) { + if (!Op.isCPI()) + return nullptr; + + ArrayRef<MachineConstantPoolEntry> Constants = + MI.getParent()->getParent()->getConstantPool()->getConstants(); + const MachineConstantPoolEntry &ConstantEntry = + Constants[Op.getIndex()]; + + // Bail if this is a machine constant pool entry, we won't be able to dig out + // anything useful. + if (ConstantEntry.isMachineConstantPoolEntry()) + return nullptr; + + auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal); + assert((!C || ConstantEntry.getType() == C->getType()) && + "Expected a constant of the same type!"); + return C; +} + +static std::string getShuffleComment(const MachineOperand &DstOp, + const MachineOperand &SrcOp, + ArrayRef<int> Mask) { + std::string Comment; + + // Compute the name for a register. This is really goofy because we have + // multiple instruction printers that could (in theory) use different + // names. Fortunately most people use the ATT style (outside of Windows) + // and they actually agree on register naming here. Ultimately, this is + // a comment, and so its OK if it isn't perfect. + auto GetRegisterName = [](unsigned RegNum) -> StringRef { + return X86ATTInstPrinter::getRegisterName(RegNum); + }; + + StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem"; + StringRef SrcName = SrcOp.isReg() ? GetRegisterName(SrcOp.getReg()) : "mem"; + + raw_string_ostream CS(Comment); + CS << DstName << " = "; + bool NeedComma = false; + bool InSrc = false; + for (int M : Mask) { + // Wrap up any prior entry... 
+ if (M == SM_SentinelZero && InSrc) { + InSrc = false; + CS << "]"; + } + if (NeedComma) + CS << ","; + else + NeedComma = true; + + // Print this shuffle... + if (M == SM_SentinelZero) { + CS << "zero"; + } else { + if (!InSrc) { + InSrc = true; + CS << SrcName << "["; + } + if (M == SM_SentinelUndef) + CS << "u"; + else + CS << M; + } + } + if (InSrc) + CS << "]"; + CS.flush(); + + return Comment; } void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); - const X86RegisterInfo *RI = - static_cast<const X86RegisterInfo *>(TM.getRegisterInfo()); + const X86RegisterInfo *RI = static_cast<const X86RegisterInfo *>( + TM.getSubtargetImpl()->getRegisterInfo()); switch (MI->getOpcode()) { case TargetOpcode::DBG_VALUE: @@ -812,7 +1020,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::TLS_addr64: case X86::TLS_base_addr32: case X86::TLS_base_addr64: - return LowerTlsAddr(OutStreamer, MCInstLowering, *MI, getSubtargetInfo()); + return LowerTlsAddr(MCInstLowering, *MI); case X86::MOVPC32r: { // This is a pseudo op for a two instruction sequence with a label, which @@ -825,15 +1033,15 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *PICBase = MF->getPICBaseSymbol(); // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. - EmitToStreamer(OutStreamer, MCInstBuilder(X86::CALLpcrel32) + EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) .addExpr(MCSymbolRefExpr::Create(PICBase, OutContext))); // Emit the label. 
OutStreamer.EmitLabel(PICBase); // popl $reg - EmitToStreamer(OutStreamer, MCInstBuilder(X86::POP32r) - .addReg(MI->getOperand(0).getReg())); + EmitAndCountInstruction(MCInstBuilder(X86::POP32r) + .addReg(MI->getOperand(0).getReg())); return; } @@ -863,29 +1071,32 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext), DotExpr, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(X86::ADD32ri) + EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri) .addReg(MI->getOperand(0).getReg()) .addReg(MI->getOperand(1).getReg()) .addExpr(DotExpr)); return; } + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), TM, + getSubtargetInfo(), MCInstLowering); case TargetOpcode::STACKMAP: - return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo()); + return LowerSTACKMAP(*MI); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit(), getSubtargetInfo()); + return LowerPATCHPOINT(*MI); case X86::MORESTACK_RET: - EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); return; case X86::MORESTACK_RET_RESTORE_R10: // Return, then restore R10. - EmitToStreamer(OutStreamer, MCInstBuilder(getRetOpcode(*Subtarget))); - EmitToStreamer(OutStreamer, MCInstBuilder(X86::MOV64rr) - .addReg(X86::R10) - .addReg(X86::RAX)); + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr) + .addReg(X86::R10) + .addReg(X86::RAX)); return; case X86::SEH_PushReg: @@ -918,9 +1129,151 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::SEH_EndPrologue: OutStreamer.EmitWinCFIEndProlog(); return; + + case X86::SEH_Epilogue: { + MachineBasicBlock::const_iterator MBBI(MI); + // Check if preceded by a call and emit nop if so. 
+ for (MBBI = PrevCrossBBInst(MBBI); MBBI; MBBI = PrevCrossBBInst(MBBI)) { + // Conservatively assume that pseudo instructions don't emit code and keep + // looking for a call. We may emit an unnecessary nop in some cases. + if (!MBBI->isPseudo()) { + if (MBBI->isCall()) + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + break; + } + } + return; + } + + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: { + if (!OutStreamer.isVerboseAsm()) + break; + assert(MI->getNumOperands() > 5 && + "We should always have at least 5 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &MaskOp = MI->getOperand(5); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodePSHUFBMask(C, Mask); + if (!Mask.empty()) + OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + case X86::VPERMILPSrm: + case X86::VPERMILPDrm: + case X86::VPERMILPSYrm: + case X86::VPERMILPDYrm: { + if (!OutStreamer.isVerboseAsm()) + break; + assert(MI->getNumOperands() > 5 && + "We should always have at least 5 operands!"); + const MachineOperand &DstOp = MI->getOperand(0); + const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &MaskOp = MI->getOperand(5); + + if (auto *C = getConstantFromPool(*MI, MaskOp)) { + SmallVector<int, 16> Mask; + DecodeVPERMILPMask(C, Mask); + if (!Mask.empty()) + OutStreamer.AddComment(getShuffleComment(DstOp, SrcOp, Mask)); + } + break; + } + + // For loads from a constant pool to a vector register, print the constant + // loaded. 
+ case X86::MOVAPDrm: + case X86::VMOVAPDrm: + case X86::VMOVAPDYrm: + case X86::MOVUPDrm: + case X86::VMOVUPDrm: + case X86::VMOVUPDYrm: + case X86::MOVAPSrm: + case X86::VMOVAPSrm: + case X86::VMOVAPSYrm: + case X86::MOVUPSrm: + case X86::VMOVUPSrm: + case X86::VMOVUPSYrm: + case X86::MOVDQArm: + case X86::VMOVDQArm: + case X86::VMOVDQAYrm: + case X86::MOVDQUrm: + case X86::VMOVDQUrm: + case X86::VMOVDQUYrm: + if (!OutStreamer.isVerboseAsm()) + break; + if (MI->getNumOperands() > 4) + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { + CS << "["; + for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { + if (i != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + CS << CDS->getElementAsInteger(i); + else if (CDS->getElementType()->isFloatTy()) + CS << CDS->getElementAsFloat(i); + else if (CDS->getElementType()->isDoubleTy()) + CS << CDS->getElementAsDouble(i); + else + CS << "?"; + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + } else if (auto *CV = dyn_cast<ConstantVector>(C)) { + CS << "<"; + for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { + if (i != 0) + CS << ","; + Constant *COp = CV->getOperand(i); + if (isa<UndefValue>(COp)) { + CS << "u"; + } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { + CS << CI->getZExtValue(); + } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } + } + CS << ">"; + OutStreamer.AddComment(CS.str()); + } + } + break; } MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(OutStreamer, TmpInst); + + // Stackmap shadows cannot include branch targets, so we can count the bytes + // in a call towards the shadow, but 
must ensure that the no thread returns + // in to the stackmap shadow. The only way to achieve this is if the call + // is at the end of the shadow. + if (MI->isCall()) { + // Count then size of the call towards the shadow + SMShadowTracker.count(TmpInst, getSubtargetInfo()); + // Then flush the shadow so that we fill with nops before the call, not + // after it. + SMShadowTracker.emitShadowPadding(OutStreamer, getSubtargetInfo()); + // Then emit the call + OutStreamer.EmitInstruction(TmpInst, getSubtargetInfo()); + return; + } + + EmitAndCountInstruction(TmpInst); } diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index 568dc222d9d1..ac2cdc8c6567 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -8,7 +8,26 @@ //===----------------------------------------------------------------------===// #include "X86MachineFunctionInfo.h" +#include "X86RegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; void X86MachineFunctionInfo::anchor() { } + +void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) { + if (!RestoreBasePointerOffset) { + const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>( + MF->getSubtarget().getRegisterInfo()); + unsigned SlotSize = RegInfo->getSlotSize(); + for (const MCPhysReg *CSR = + RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF); + unsigned Reg = *CSR; + ++CSR) + { + if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) + RestoreBasePointerOffset -= SlotSize; + } + } +} + diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 78d20ce870f1..9fd03a7059cf 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -11,10 +11,13 @@ // 
//===----------------------------------------------------------------------===// -#ifndef X86MACHINEFUNCTIONINFO_H -#define X86MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineValueType.h" +#include <vector> namespace llvm { @@ -29,6 +32,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// contains stack pointer re-alignment code which requires FP. bool ForceFramePointer; + /// RestoreBasePointerOffset - Non-zero if the function has base pointer + /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a + /// displacement from the frame pointer to a slot where the base pointer + /// is stashed. + signed char RestoreBasePointerOffset; + /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. unsigned CalleeSavedFrameSize; @@ -68,9 +77,18 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackSize; /// NumLocalDynamics - Number of local-dynamic TLS accesses. unsigned NumLocalDynamics; + /// HasPushSequences - Keeps track of whether this function uses sequences + /// of pushes to pass function parameters. + bool HasPushSequences; + +private: + /// ForwardedMustTailRegParms - A list of virtual and physical registers + /// that must be forwarded to every musttail call. 
+ SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms; public: X86MachineFunctionInfo() : ForceFramePointer(false), + RestoreBasePointerOffset(0), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), @@ -82,10 +100,12 @@ public: VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0) {} + NumLocalDynamics(0), + HasPushSequences(false) {} explicit X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), + RestoreBasePointerOffset(0), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), @@ -97,11 +117,19 @@ public: VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0) {} + NumLocalDynamics(0), + HasPushSequences(false) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + bool getHasPushSequences() const { return HasPushSequences; } + void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } + + bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } + void setRestoreBasePointer(const MachineFunction *MF); + int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } @@ -138,6 +166,9 @@ public: unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { + return ForwardedMustTailRegParms; + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 6639875d07e3..adc05b2f6ea4 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -105,7 +105,7 @@ bool 
PadShortFunc::runOnMachineFunction(MachineFunction &MF) { if (!TM->getSubtarget<X86Subtarget>().padShortFunctions()) return false; - TII = TM->getInstrInfo(); + TII = TM->getSubtargetImpl()->getInstrInfo(); // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); @@ -195,7 +195,8 @@ bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB, return true; } - CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI); + CyclesToEnd += TII->getInstrLatency( + TM->getSubtargetImpl()->getInstrItineraryData(), MI); } VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index e8a7e84bb7e3..0fa38f453706 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -68,8 +68,10 @@ X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) if (Is64Bit) { SlotSize = 8; - StackPtr = X86::RSP; - FramePtr = X86::RBP; + StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? + X86::RSP : X86::ESP; + FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? + X86::RBP : X86::EBP; } else { SlotSize = 4; StackPtr = X86::ESP; @@ -120,7 +122,7 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass* X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ // Don't allow super-classes of GR8_NOREX. This class is only used after - // extrating sub_8bit_hi sub-registers. The H sub-registers cannot be copied + // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the // reigster class inflation. 
// @@ -196,7 +198,7 @@ X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { unsigned X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { @@ -324,7 +326,7 @@ X86RegisterInfo::getNoPreservedMask() const { BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); // Set the stack-pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); @@ -441,7 +443,8 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); - unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment(); + unsigned StackAlign = + MF.getSubtarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, @@ -456,24 +459,18 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - if (Reg == FramePtr && TFI->hasFP(MF)) { - FrameIdx = MF.getFrameInfo()->getObjectIndexBegin(); - return true; - } - return false; + // Since X86 defines assignCalleeSavedSpillSlots which always return true + // this function neither used nor tested. 
+ llvm_unreachable("Unused function on X86. Otherwise need a test case."); } void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr; @@ -488,6 +485,12 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); + // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit + // register as source operand, semantic is the same and destination is + // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. + if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) + BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); + // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); @@ -501,6 +504,9 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } else FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + if (BasePtr == StackPtr) + FIOffset += SPAdj; + // The frame index format for stackmaps and patchpoints is different from the // X86 format. It only has a FI and an offset. if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { @@ -526,10 +532,18 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); return TFI->hasFP(MF) ? 
FramePtr : StackPtr; } +unsigned X86RegisterInfo::getPtrSizedFrameRegister( + const MachineFunction &MF) const { + unsigned FrameReg = getFrameRegister(MF); + if (Subtarget.isTarget64BitILP32()) + FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); + return FrameReg; +} + namespace llvm { unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, bool High) { diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 74efd1fe32d7..406b1fcd8d0f 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86REGISTERINFO_H -#define X86REGISTERINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H +#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #include "llvm/Target/TargetRegisterInfo.h" @@ -122,6 +122,7 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; + unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const; unsigned getStackRegister() const { return StackPtr; } unsigned getBaseRegister() const { return BasePtr; } // FIXME: Move to FrameInfok diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index 0da98637496e..30cd09a5a2f1 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -166,6 +166,7 @@ def FP3 : X86Reg<"fp3", 0>; def FP4 : X86Reg<"fp4", 0>; def FP5 : X86Reg<"fp5", 0>; def FP6 : X86Reg<"fp6", 0>; +def FP7 : X86Reg<"fp7", 0>; // XMM Registers, used by the various SSE instruction set extensions. 
def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; @@ -234,22 +235,18 @@ let SubRegIndices = [sub_ymm] in { def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; -class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> { - let Aliases = A; -} - // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That // way both kinds can be live without exceeding the stack depth. ST registers // are only live around inline assembly. -def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>; -def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>; -def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>; -def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>; -def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>; -def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>; -def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>; -def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>; +def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>; +def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>; +def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>; +def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>; +def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>; +def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>; +def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>; +def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; // Floating-point status word def FPSW : X86Reg<"fpsw", 0>; @@ -266,14 +263,22 @@ def FS : X86Reg<"fs", 4>; def GS : X86Reg<"gs", 5>; // Debug registers -def DR0 : X86Reg<"dr0", 0>; -def DR1 : X86Reg<"dr1", 1>; -def DR2 : X86Reg<"dr2", 2>; -def DR3 : X86Reg<"dr3", 3>; -def DR4 : X86Reg<"dr4", 4>; -def DR5 : X86Reg<"dr5", 5>; -def DR6 : X86Reg<"dr6", 6>; -def DR7 : X86Reg<"dr7", 7>; +def DR0 : 
X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; +def DR8 : X86Reg<"dr8", 8>; +def DR9 : X86Reg<"dr9", 9>; +def DR10 : X86Reg<"dr10", 10>; +def DR11 : X86Reg<"dr11", 11>; +def DR12 : X86Reg<"dr12", 12>; +def DR13 : X86Reg<"dr13", 13>; +def DR14 : X86Reg<"dr14", 14>; +def DR15 : X86Reg<"dr15", 15>; // Control registers def CR0 : X86Reg<"cr0", 0>; diff --git a/contrib/llvm/lib/Target/X86/X86Relocations.h b/contrib/llvm/lib/Target/X86/X86Relocations.h deleted file mode 100644 index 03330566161d..000000000000 --- a/contrib/llvm/lib/Target/X86/X86Relocations.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- X86Relocations.h - X86 Code Relocations -----------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the X86 target-specific relocation types. -// -//===----------------------------------------------------------------------===// - -#ifndef X86RELOCATIONS_H -#define X86RELOCATIONS_H - -#include "llvm/CodeGen/MachineRelocation.h" - -namespace llvm { - namespace X86 { - /// RelocationType - An enum for the x86 relocation codes. Note that - /// the terminology here doesn't follow x86 convention - word means - /// 32-bit and dword means 64-bit. The relocations will be treated - /// by JIT or ObjectCode emitters, this is transparent to the x86 code - /// emitter but JIT and ObjectCode will treat them differently - enum RelocationType { - /// reloc_pcrel_word - PC relative relocation, add the relocated value to - /// the value already in memory, after we adjust it for where the PC is. 
- reloc_pcrel_word = 0, - - /// reloc_picrel_word - PIC base relative relocation, add the relocated - /// value to the value already in memory, after we adjust it for where the - /// PIC base is. - reloc_picrel_word = 1, - - /// reloc_absolute_word - absolute relocation, just add the relocated - /// value to the value already in memory. - reloc_absolute_word = 2, - - /// reloc_absolute_word_sext - absolute relocation, just add the relocated - /// value to the value already in memory. In object files, it represents a - /// value which must be sign-extended when resolving the relocation. - reloc_absolute_word_sext = 3, - - /// reloc_absolute_dword - absolute relocation, just add the relocated - /// value to the value already in memory. - reloc_absolute_dword = 4 - }; - } -} - -#endif diff --git a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td index 6966d616f8e3..73a32304302a 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm/lib/Target/X86/X86SchedHaswell.td @@ -48,13 +48,17 @@ def HWPort6 : ProcResource<1>; def HWPort7 : ProcResource<1>; // Many micro-ops are capable of issuing on multiple ports. 
+def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>; def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>; def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; -def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; +def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>; +def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>; def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; // 60 Entry Unified Scheduler @@ -125,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>; defm : HWWriteResPair<WriteFMul, HWPort0, 5>; defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles. defm : HWWriteResPair<WriteFRcp, HWPort0, 5>; +defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>; defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; @@ -261,4 +266,1882 @@ def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } def : WriteRes<WriteFence, [HWPort23, HWPort4]>; def : WriteRes<WriteNop, []>; + +//================ Exceptions ================// + +//-- Specific Scheduling Models --// + +// Starting with P0. 
+def WriteP0 : SchedWriteRes<[HWPort0]>; + +def WriteP0_P1_Lat4 : SchedWriteRes<[HWPort0, HWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP0_P1_Lat4Ld : SchedWriteRes<[HWPort0, HWPort1, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP01 : SchedWriteRes<[HWPort01]>; + +def Write2P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 2; +} +def Write3P01 : SchedWriteRes<[HWPort01]> { + let NumMicroOps = 3; +} + +def WriteP015 : SchedWriteRes<[HWPort015]>; + +def WriteP01_P5 : SchedWriteRes<[HWPort01, HWPort5]> { + let NumMicroOps = 2; +} +def WriteP06 : SchedWriteRes<[HWPort06]>; + +def Write2P06 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def Write3P06_Lat2 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} + +def WriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 2; +} + +def Write2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} + +def Write2P0156_Lat2 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def Write2P0156_Lat2Ld : SchedWriteRes<[HWPort0156, HWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def Write5P0156 : SchedWriteRes<[HWPort0156]> { + let NumMicroOps = 5; + let ResourceCycles = [5]; +} + +def WriteP0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [1, 2, 1]; +} + +def Write2P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 2, 1]; +} + +def Write3P0156_2P237_P4 : SchedWriteRes<[HWPort0156, HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [3, 2, 1]; +} + +// Starting with P1. 
+def WriteP1 : SchedWriteRes<[HWPort1]>; + +def WriteP1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 2; +} +def WriteP1_Lat3 : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def WriteP1_Lat3Ld : SchedWriteRes<[HWPort1, HWPort23]> { + let Latency = 7; +} + +def Write2P1 : SchedWriteRes<[HWPort1]> { + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def Write2P1_P23 : SchedWriteRes<[HWPort1, HWPort23]> { + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def WriteP15 : SchedWriteRes<[HWPort15]>; +def WriteP15Ld : SchedWriteRes<[HWPort15, HWPort23]> { + let Latency = 4; +} + +def WriteP1_P5_Lat4 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat4Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +def WriteP1_P5_Lat6 : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +def WriteP1_P5_Lat6Ld : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} + +// Starting with P2. +def Write2P237_P4 : SchedWriteRes<[HWPort237, HWPort4]> { + let Latency = 1; + let ResourceCycles = [2, 1]; +} + +// Starting with P5. +def WriteP5 : SchedWriteRes<[HWPort5]>; +def WriteP5Ld : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} + +// Notation: +// - r: register. +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. +// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. +// - m = memory. + +//=== Integer Instructions ===// +//-- Move instructions --// + +// MOV. +// r16,m. +def : InstRW<[WriteALULd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// CMOVcc. +// r,r. 
+def : InstRW<[Write2P0156_Lat2], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; + +// XCHG. +// r,r. +def WriteXCHG : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let ResourceCycles = [3]; +} + +def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. +def WriteXCHGrm : SchedWriteRes<[]> { + let Latency = 21; + let NumMicroOps = 8; +} +def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>; + +// XLAT. +def WriteXLAT : SchedWriteRes<[]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteXLAT], (instregex "XLAT")>; + +// PUSH. +// m. +def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>; + +// PUSHF. +def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> { + let NumMicroOps = 4; +} +def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def WritePushA : SchedWriteRes<[]> { + let NumMicroOps = 19; +} +def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>; + +// POP. +// m. +def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>; + +// POPF. +def WritePopF : SchedWriteRes<[]> { + let NumMicroOps = 9; +} +def : InstRW<[WritePopF], (instregex "POPF(16|32)")>; + +// POPA. +def WritePopA : SchedWriteRes<[]> { + let NumMicroOps = 18; +} +def : InstRW<[WritePopA], (instregex "POPA(16|32)")>; + +// LAHF SAHF. +def : InstRW<[WriteP06], (instregex "(S|L)AHF")>; + +// BSWAP. +// r32. +def WriteBSwap32 : SchedWriteRes<[HWPort15]>; +def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>; + +// r64. +def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>; + +// MOVBE. +// r16,m16 / r64,m64. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>; + +// r32, m32. 
+def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>; + +// m16,r16. +def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>; + +// m32,r32. +def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>; + +// m64,r64. +def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>; + +// ADC SBB. +// r,r/i. +def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", + "(ADC|SBB)(16|32|64)ri8", + "(ADC|SBB)64ri32", + "(ADC|SBB)(8|16|32|64)rr_REV")>; + +// r,m. +def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>; + +// m,r/i. +def : InstRW<[Write3P0156_2P237_P4], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteP0156_2P237_P4], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", + "(INC|DEC)64(16|32)m")>; + +// MUL IMUL. +// r16. +def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>; + +// m16. +def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 5; +} +def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>; + +// r32. 
+def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>; + +// m32. +def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; +} +def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>; + +// r64. +def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>; + +// m64. +def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>; + +// r16,r16. +def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; + +// r16,m16. +def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>; + +// MULX. +// r32,r32,r32. +def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteMulX32], (instregex "MULX32rr")>; + +// r32,r32,m32. +def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>; + +// r64,r64,r64. +def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[WriteMulX64], (instregex "MULX64rr")>; + +// r64,r64,m64. +def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; +} +def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>; + +// DIV. +// r8. 
+def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteDiv8], (instregex "DIV8r")>; + +// r16. +def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv16], (instregex "DIV16r")>; + +// r32. +def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 10; +} +def : InstRW<[WriteDiv32], (instregex "DIV32r")>; + +// r64. +def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 32; + let NumMicroOps = 36; +} +def : InstRW<[WriteDiv64], (instregex "DIV64r")>; + +// IDIV. +// r8. +def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>; + +// r16. +def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 23; + let NumMicroOps = 10; +} +def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>; + +// r32. +def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 22; + let NumMicroOps = 9; +} +def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>; + +// r64. +def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { + let Latency = 39; + let NumMicroOps = 59; +} +def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[Write2P0156_2P237_P4], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// SHR SHL SAR. +// m,i. +def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>; + +// m,cl. 
+def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> { + let NumMicroOps = 6; + let ResourceCycles = [3, 2, 1]; +} +def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>; + +// ROR ROL. +// r,1. +def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>; + +// m,i. +def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 5; + let ResourceCycles = [2, 2, 1]; +} +def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>; + +// r,cl. +def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>; + +// m,cl. +def WriteRotateRMWCL : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>; + +// RCR RCL. +// r,1. +def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>; + +// m,1. +def WriteRCm1 : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>; + +// r,i. +def WriteRCri : SchedWriteRes<[HWPort0156]> { + let Latency = 6; + let NumMicroOps = 8; +} +def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; + +// m,i. +def WriteRCmi : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; + +// SHRD SHLD. +// r,r,i. +def WriteShDrr : SchedWriteRes<[HWPort1]> { + let Latency = 3; +} +def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>; + +// m,r,i. +def WriteShDmr : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def WriteShlDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; +} +def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>; + +// r,r,cl. 
+def WriteShrDCL : SchedWriteRes<[HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>; + +// m,r,cl. +def WriteShDmrCL : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +// BT. +// r,r/i. +def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTmr : SchedWriteRes<[]> { + let NumMicroOps = 10; +} +def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + +// m,r. +def WriteBTRSCmr : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>; + +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>; + +// BSF BSR. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>; + +// SETcc. +// r. +def : InstRW<[WriteShift], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; +// m. +def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSetCCm], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; + +// CLD STD. +def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>; + +// LZCNT TZCNT. +// r,r. +def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>; + +// ANDN. +// r,r. +def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>; +// r,m. +def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>; + +// BLSI BLSMSK BLSR. +// r,r. +def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>; +// r,m. 
+def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; + +// BEXTR. +// r,r,r. +def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>; +// r,m,r. +def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>; + +// BZHI. +// r,r,r. +def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. +def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; + +// LOOP. +def WriteLOOP : SchedWriteRes<[]> { + let NumMicroOps = 7; +} +def : InstRW<[WriteLOOP], (instregex "LOOP")>; + +// LOOP(N)E +def WriteLOOPE : SchedWriteRes<[]> { + let NumMicroOps = 11; +} +def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>; + +// CALL. +// r. +def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>; + +// m. +def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>; + +// RET. +def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>; + +// i. +def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> { + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>; + +// BOUND. +// r,m. +def WriteBOUND : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteBOUND], (instregex "BOUNDS(16|32)rm")>; + +// INTO. 
+def WriteINTO : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteINTO], (instregex "INTO")>; + +//-- String instructions --// + +// LODSB/W. +def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>; + +// LODSD/Q. +def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>; + +// STOS. +def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>; + +// MOVS. +def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 1, 2]; +} +def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>; + +// SCAS. +def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>; + +// CMPS. +def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 5; + let ResourceCycles = [2, 3]; +} +def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>; + +//-- Synchronization instructions --// + +// XADD. +def WriteXADD : SchedWriteRes<[]> { + let NumMicroOps = 5; +} +def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>; + +// CMPXCHG. +def WriteCMPXCHG : SchedWriteRes<[]> { + let NumMicroOps = 6; +} +def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; + +// CMPXCHG8B. +def WriteCMPXCHG8B : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>; + +// CMPXCHG16B. +def WriteCMPXCHG16B : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>; + +//-- Other --// + +// PAUSE. +def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> { + let NumMicroOps = 5; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WritePAUSE], (instregex "PAUSE")>; + +// LEAVE. +def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>; + +// XGETBV. +def WriteXGETBV : SchedWriteRes<[]> { + let NumMicroOps = 8; +} +def : InstRW<[WriteXGETBV], (instregex "XGETBV")>; + +// RDTSC. 
+def WriteRDTSC : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteRDTSC], (instregex "RDTSC")>; + +// RDPMC. +def WriteRDPMC : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteRDPMC], (instregex "RDPMC")>; + +// RDRAND. +def WriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> { + let NumMicroOps = 17; + let ResourceCycles = [1, 16]; +} +def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>; + +//=== Floating Point x87 Instructions ===// +//-- Move instructions --// + +// FLD. +// m80. +def : InstRW<[WriteP01], (instregex "LD_Frr")>; + +def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [2, 2]; +} +def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>; + +// FBLD. +// m80. +def WriteFBLD : SchedWriteRes<[]> { + let Latency = 47; + let NumMicroOps = 43; +} +def : InstRW<[WriteFBLD], (instregex "FBLDm")>; + +// FST(P). +// r. +def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>; + +// m80. +def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> { + let NumMicroOps = 7; + let ResourceCycles = [3, 2, 2]; +} +def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>; + +// FBSTP. +// m80. +def WriteFBSTP : SchedWriteRes<[]> { + let NumMicroOps = 226; +} +def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>; + +// FXCHG. +def : InstRW<[WriteNop], (instregex "XCH_F")>; + +// FILD. +def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>; + +// FIST(P) FISTTP. +def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>; + +// FLDZ. +def : InstRW<[WriteP01], (instregex "LD_F0")>; + +// FLD1. +def : InstRW<[Write2P01], (instregex "LD_F1")>; + +// FLDPI FLDL2E etc. +def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; + +// FCMOVcc. 
+def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; + +// FNSTSW. +// AX. +def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> { + let NumMicroOps = 2; +} +def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>; + +// m16. +def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> { + let Latency = 6; + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>; + +// FLDCW. +def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> { + let Latency = 7; + let NumMicroOps = 3; +} +def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>; + +// FNSTCW. +def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { + let NumMicroOps = 3; +} +def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>; + +// FINCSTP FDECSTP. +def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>; + +// FFREE. +def : InstRW<[WriteP01], (instregex "FFREE")>; + +// FNSAVE. +def WriteFNSAVE : SchedWriteRes<[]> { + let NumMicroOps = 147; +} +def : InstRW<[WriteFNSAVE], (instregex "FSAVEm")>; + +// FRSTOR. +def WriteFRSTOR : SchedWriteRes<[]> { + let NumMicroOps = 90; +} +def : InstRW<[WriteFRSTOR], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +// FABS. +def : InstRW<[WriteP0], (instregex "ABS_F")>; + +// FCHS. +def : InstRW<[WriteP0], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. +def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", + "UCOM_FPr")>; +// m. +def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; + +// FCOMI(P) FUCOMI(P). +// m. +def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", + "UCOM_FIPr")>; + +// FICOM(P). +def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; + +// FTST. 
+def : InstRW<[WriteP1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[Write2P1], (instregex "FXAM")>; + +// FPREM. +def WriteFPREM : SchedWriteRes<[]> { + let Latency = 19; + let NumMicroOps = 28; +} +def : InstRW<[WriteFPREM], (instregex "FPREM")>; + +// FPREM1. +def WriteFPREM1 : SchedWriteRes<[]> { + let Latency = 27; + let NumMicroOps = 41; +} +def : InstRW<[WriteFPREM1], (instregex "FPREM1")>; + +// FRNDINT. +def WriteFRNDINT : SchedWriteRes<[]> { + let Latency = 11; + let NumMicroOps = 17; +} +def : InstRW<[WriteFRNDINT], (instregex "FRNDINT")>; + +//-- Math instructions --// + +// FSCALE. +def WriteFSCALE : SchedWriteRes<[]> { + let Latency = 75; // 49-125 + let NumMicroOps = 50; // 25-75 +} +def : InstRW<[WriteFSCALE], (instregex "FSCALE")>; + +// FXTRACT. +def WriteFXTRACT : SchedWriteRes<[]> { + let Latency = 15; + let NumMicroOps = 17; +} +def : InstRW<[WriteFXTRACT], (instregex "FXTRACT")>; + +//-- Other instructions --// + +// FNOP. +def : InstRW<[WriteP01], (instregex "FNOP")>; + +// WAIT. +def : InstRW<[Write2P01], (instregex "WAIT")>; + +// FNCLEX. +def : InstRW<[Write5P0156], (instregex "FNCLEX")>; + +// FNINIT. +def WriteFNINIT : SchedWriteRes<[]> { + let NumMicroOps = 26; +} +def : InstRW<[WriteFNINIT], (instregex "FNINIT")>; + +//=== Integer MMX and XMM Instructions ===// +//-- Move instructions --// + +// MOVD. +// r32/64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", + "VMOVPDI2DIrr", "MOVPDI2DIrr")>; + +// (x)mm <- r32/64. +def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", + "VMOVDI2PDIrr", "MOVDI2PDIrr")>; + +// MOVQ. +// r64 <- (x)mm. +def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>; + +// (x)mm <- r64. +def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; + +// (x)mm <- (x)mm. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>; + +// (V)MOVDQA/U. +// x <- x. 
+def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", + "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV", + "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; + +// MOVDQ2Q. +def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>; + +// MOVQ2DQ. +def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>; + + +// PACKSSWB/DW. +// mm <- mm. +def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; + +// mm <- m64. +def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1, 3]; +} +def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; + +// VPMOVSX/ZX BW BD BQ DW DQ. +// y <- x. +def WriteVPMOVSX : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; +} +def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; + +// PBLENDW. +// x,x,i / v,v,v,i +def WritePBLENDWr : SchedWriteRes<[HWPort5]>; +def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>; + +// x,m,i / v,v,m,i +def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>; + +// VPBLENDD. +// v,v,v,i. +def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>; +def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>; + +// v,v,m,i +def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> { + let NumMicroOps = 2; + let Latency = 4; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>; + +// MASKMOVQ. 
+def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 2]; +} +def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [4, 2, 4]; +} +def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOV D/Q. +// v,v,m. +def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVPMASKMOVr, ReadAfterLd], + (instregex "VPMASKMOV(D|Q)(Y?)rm")>; + +// m, v,v. +def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// PMOVMSKB. +def WritePMOVMSKB : SchedWriteRes<[HWPort0]> { + let Latency = 3; +} +def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>; + +// PEXTR B/W/D/Q. +// r32,x,i. +def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; + +// m8,x,i. +def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> { + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. 
+def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHERDD. +// x. +def WriteVPGATHERDD128 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>; + +// y. +def WriteVPGATHERDD256 : SchedWriteRes<[]> { + let NumMicroOps = 34; +} +def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>; + +// VPGATHERQD. +// x. +def WriteVPGATHERQD128 : SchedWriteRes<[]> { + let NumMicroOps = 15; +} +def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>; + +// y. +def WriteVPGATHERQD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>; + +// VPGATHERDQ. +// x. +def WriteVPGATHERDQ128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>; + +// y. +def WriteVPGATHERDQ256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>; + +// VPGATHERQQ. +// x. +def WriteVPGATHERQQ128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>; + +// y. +def WriteVPGATHERQQ256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; + +//-- Arithmetic instructions --// + +// PHADD|PHSUB (S) W/D. +// v <- v,v. 
+def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", + "MMX_PHADDSWrr64", + "MMX_PHSUB(W|D)rr64", + "MMX_PHSUBSWrr64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rr", + "(V?)PH(ADD|SUB)SWrr(256)?")>; + +// v <- v,m. +def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} +def : InstRW<[WritePHADDSUBm, ReadAfterLd], + (instregex "MMX_PHADD(W?)rm64", + "MMX_PHADDSWrm64", + "MMX_PHSUB(W|D)rm64", + "MMX_PHSUBSWrm64", + "(V?)PH(ADD|SUB)(W|D)(Y?)rm", + "(V?)PH(ADD|SUB)SWrm(128|256)?")>; + +// PCMPGTQ. +// v <- v,v. +def WritePCMPGTQr : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// v <- v,m. +def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>; + +// PMULLD. +// x,x / y,y,y. +def WritePMULLDr : SchedWriteRes<[HWPort0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>; + +// x,m / y,y,m. +def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>; + +//-- Logic instructions --// + +// PTEST. +// v,v. +def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; + +// v,m. 
+def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+// Use the memory-form write (WritePTESTm) here; the register-form write
+// was mistakenly reused, leaving WritePTESTm dead and (V)PTESTrm with
+// register-form latency.
+def : InstRW<[WritePTESTm], (instregex "(V?)PTEST(Y?)rm")>;
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+
+// PSLL,PSRL DQ.
+def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+
+//-- Other --//
+
+// EMMS.
+def WriteEMMS : SchedWriteRes<[]> {
+  let Latency = 13;
+  let NumMicroOps = 31;
+}
+def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// MOVMSKP S/D.
+// r32 <- x.
+def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
+  let Latency = 3;
+}
+def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+
+// r32 <- y.
+def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
+  let Latency = 2;
+}
+def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+
+// VPERM2F128.
+def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
+def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+
+// BLENDVP S/D.
+def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
+def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+
+// VBROADCASTF128.
+def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+
+// EXTRACTPS.
+// r32,x,i.
+def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+// m32,x,i.
+def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 1, 1];
+}
+def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+
+// m128,y,i.
+def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+
+// y,y,m128,i.
+def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+// Use the dedicated memory-form write defined above; WriteFShuffle256 was
+// used by mistake, leaving WriteVINSERTF128m dead.
+def : InstRW<[WriteVINSERTF128m, ReadAfterLd], (instregex "VINSERTF128rm")>;
+
+// VMASKMOVP S/D.
+// v,v,m.
+def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+
+// m128,x,x.
+def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 13;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+
+// m256,y,y.
+def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
+  let Latency = 14;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 1, 1, 1];
+}
+def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+
+// VGATHERDPS.
+// x.
+def WriteVGATHERDPS128 : SchedWriteRes<[]> {
+  let NumMicroOps = 20;
+}
+def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+
+// y.
+def WriteVGATHERDPS256 : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def WriteVGATHERQPS128 : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+
+// y.
+def WriteVGATHERQPS256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>; + +// VGATHERDPD. +// x. +def WriteVGATHERDPD128 : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>; + +// y. +def WriteVGATHERDPD256 : SchedWriteRes<[]> { + let NumMicroOps = 20; +} +def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>; + +// VGATHERQPD. +// x. +def WriteVGATHERQPD128 : SchedWriteRes<[]> { + let NumMicroOps = 14; +} +def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>; + +// y. +def WriteVGATHERQPD256 : SchedWriteRes<[]> { + let NumMicroOps = 22; +} +def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>; + +//-- Conversion instructions --// + +// CVTPD2PS. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>; + +// x,m128. +def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>; + +// x,y. +def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>; + +// x,m256. +def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>; + +// CVTSD2SS. +// x,x. +def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>; + +// x,m64. +def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>; + +// CVTPS2PD. +// x,x. +def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>; + +// x,m64. +// y,m128. 
+def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
+
+// y,x.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
+
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
+// x,m128.
+def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// x,y.
+def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
+
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+// CVTSI2SD.
+// x,r32/64.
+// Fix copy-paste typo: this pattern is documented as CVTSI2SD but matched
+// CVTSI2SS (already covered above), leaving CVTSI2SD with no mapping.
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+// CVTSD2SI.
+// r32/64
+def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
+  let Latency = 5;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+
+// x,m / v,v,m.
+def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+
+// MUL SS/SD PS/PD.
+// x,x / v,v,v.
+def WriteMULr : SchedWriteRes<[HWPort01]> {
+  let Latency = 5;
+}
+def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+
+// x,m / v,v,m.
+def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// VDIVPS.
+// y,y,y.
+def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
+  let Latency = 19; // 18-21 cycles.
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+
+// y,y,m256.
+def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
+  let Latency = 23; // 18-21 + 4 cycles.
+  let NumMicroOps = 4;
+  let ResourceCycles = [2, 1, 1];
+}
+def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 27; // 19-35 cycles. + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>; + +// y,y,m256. +def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 31; // 19-35 + 4 cycles. + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>; + +// VRCPPS. +// y,y. +def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>; + +// y,m256. +def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>; + +// ROUND SS/SD PS/PD. +// v,v,i. +def WriteROUNDr : SchedWriteRes<[HWPort1]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>; + +// v,m,i. +def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>; + +// DPPS. +// x,x,i / v,v,v,i. +def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>; + +// x,m,i / v,v,m,i. +def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> { + let Latency = 18; + let NumMicroOps = 6; + let ResourceCycles = [2, 1, 1, 1, 1]; +} +def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>; + +// DPPD. +// x,x,i. 
+def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>; + +// x,m,i. +def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>; + +// VFMADD. +// v,v,v. +def WriteFMADDr : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; +} +def : InstRW<[WriteFMADDr], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)r", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; + +// v,v,m. +def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteFMADDm], + (instregex + // 3p forms. + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", + // 3s forms. + "VF(N?)M(ADD|SUB)S(S|D)(r132|231|213)m", + // 4s/4s_int forms. + "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", + // 4p forms. + "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; + +//-- Math instructions --// + +// VSQRTPS. +// y,y. +def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>; + +// y,m256. +def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 23; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>; + +// VSQRTPD. +// y,y. +def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 28; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>; + +// y,m256. 
+def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 32; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>; + +// RSQRT SS/PS. +// x,x. +def WriteRSQRTr : SchedWriteRes<[HWPort0]> { + let Latency = 5; +} +def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>; + +// x,m128. +def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>; + +// RSQRTPS 256. +// y,y. +def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2, 1]; +} +def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; + +// y,m256. +def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2, 1, 1]; +} +def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>; + +//-- Logic instructions --// + +// AND, ANDN, OR, XOR PS/PD. +// x,x / v,v,v. +def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; +// x,m / v,v,m. +def : InstRW<[WriteP5Ld, ReadAfterLd], + (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; + +//-- Other instructions --// + +// VZEROUPPER. +def WriteVZEROUPPER : SchedWriteRes<[]> { + let NumMicroOps = 4; +} +def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>; + +// VZEROALL. +def WriteVZEROALL : SchedWriteRes<[]> { + let NumMicroOps = 12; +} +def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>; + +// LDMXCSR. +def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 1, 1]; +} +def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>; + +// STMXCSR. 
+def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1, 1, 1, 1]; +} +def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>; + } // SchedModel diff --git a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td index 83f053425aa1..eca65c2892b7 100644 --- a/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/contrib/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; defm : SBWriteResPair<WriteFMul, SBPort0, 5>; defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; +defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; diff --git a/contrib/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm/lib/Target/X86/X86Schedule.td index b76850aa1c8b..a261356afe6a 100644 --- a/contrib/llvm/lib/Target/X86/X86Schedule.td +++ b/contrib/llvm/lib/Target/X86/X86Schedule.td @@ -63,12 +63,13 @@ def WriteZero : SchedWrite; defm WriteJump : X86SchedWritePair; // Floating point. This covers both scalar and vector operations. -defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare. -defm WriteFMul : X86SchedWritePair; // Floating point multiplication. -defm WriteFDiv : X86SchedWritePair; // Floating point division. -defm WriteFSqrt : X86SchedWritePair; // Floating point square root. -defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal. -defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare. +defm WriteFMul : X86SchedWritePair; // Floating point multiplication. +defm WriteFDiv : X86SchedWritePair; // Floating point division. +defm WriteFSqrt : X86SchedWritePair; // Floating point square root. 
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate. +defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate. +defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles. defm WriteFBlend : X86SchedWritePair; // Floating point vector blends. defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends. @@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass; def IIC_SSE_SQRTSD_RR : InstrItinClass; def IIC_SSE_SQRTSD_RM : InstrItinClass; +def IIC_SSE_RSQRTPS_RR : InstrItinClass; +def IIC_SSE_RSQRTPS_RM : InstrItinClass; +def IIC_SSE_RSQRTSS_RR : InstrItinClass; +def IIC_SSE_RSQRTSS_RM : InstrItinClass; + def IIC_SSE_RCPP_RR : InstrItinClass; def IIC_SSE_RCPP_RM : InstrItinClass; def IIC_SSE_RCPS_RR : InstrItinClass; @@ -640,3 +646,5 @@ include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" include "X86ScheduleSLM.td" +include "X86ScheduleBtVer2.td" + diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td index c8820aa2d8df..4c559c9c1798 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >, InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >, + InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >, InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >, diff --git 
a/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td new file mode 100644 index 000000000000..ce1ece34e431 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -0,0 +1,341 @@ +//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AMD btver2 (Jaguar) to support +// instruction scheduling and other instruction cost heuristics. Based off AMD Software +// Optimization Guide for AMD Family 16h Processors & Instruction Latency appendix. +// +//===----------------------------------------------------------------------===// + +def BtVer2Model : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and btver2 can + // decode 2 instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 64; // Retire Control Unit + let LoadLatency = 5; // FPU latency (worse case cf Integer 3 cycle latency) + let HighLatency = 25; + let MispredictPenalty = 14; // Minimum branch misdirection penalty + let PostRAScheduler = 1; + + // FIXME: SSE4/AVX is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. 
+ let CompleteModel = 0; +} + +let SchedModel = BtVer2Model in { + +// Jaguar can issue up to 6 micro-ops in one cycle +def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam) +def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV +def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU +def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA) +def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA +def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM + +// Any pipe - FIXME we need this until we can discriminate between int/fpu load/store/moves properly +def JAny : ProcResGroup<[JALU0, JALU1, JLAGU, JSAGU, JFPU0, JFPU1]>; + +// Integer Pipe Scheduler +def JALU01 : ProcResGroup<[JALU0, JALU1]> { + let BufferSize=20; +} + +// AGU Pipe Scheduler +def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> { + let BufferSize=12; +} + +// Fpu Pipe Scheduler +def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> { + let BufferSize=18; +} + +def JDiv : ProcResource<1>; // integer division +def JMul : ProcResource<1>; // integer multiplication +def JVALU0 : ProcResource<1>; // vector integer +def JVALU1 : ProcResource<1>; // vector integer +def JVIMUL : ProcResource<1>; // vector integer multiplication +def JSTC : ProcResource<1>; // vector store/convert +def JFPM : ProcResource<1>; // FP multiplication +def JFPA : ProcResource<1>; // FP addition + +// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. 
+multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 3); + } +} + +multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [JLAGU, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on the SAGU for the store data. +def : WriteRes<WriteRMW, [JSAGU]>; + +//////////////////////////////////////////////////////////////////////////////// +// Arithmetic. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteALU, JALU01, 1>; +defm : JWriteResIntPair<WriteIMul, JALU1, 3>; + +def : WriteRes<WriteIMulH, [JALU1]> { + let Latency = 6; + let ResourceCycles = [4]; +} + +// FIXME 8/16 bit divisions +def : WriteRes<WriteIDiv, [JALU1, JDiv]> { + let Latency = 25; + let ResourceCycles = [1, 25]; +} +def : WriteRes<WriteIDivLd, [JALU1, JLAGU, JDiv]> { + let Latency = 41; + let ResourceCycles = [1, 1, 25]; +} + +// This is for simple LEAs with one or two input operands. +// FIXME: SAGU 3-operand LEA +def : WriteRes<WriteLEA, [JALU01]>; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteShift, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. +// FIXME: Split x86 and SSE load/store/moves +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteStore, [JSAGU]>; +def : WriteRes<WriteMove, [JAny]>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteZero, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteJump, JALU01, 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. +// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions? 
+// FIXME: Double precision latencies +// FIXME: SS vs PS latencies +// FIXME: ymm latencies +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; +defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>; + +def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> { + let Latency = 21; + let ResourceCycles = [1, 1, 21]; +} +def : WriteRes<WriteFSqrtLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 26; + let ResourceCycles = [1, 1, 21]; +} + +def : WriteRes<WriteFDiv, [JFPU1, JLAGU, JFPM]> { + let Latency = 19; + let ResourceCycles = [1, 1, 19]; +} +def : WriteRes<WriteFDivLd, [JFPU1, JLAGU, JFPM]> { + let Latency = 24; + let ResourceCycles = [1, 1, 19]; +} + +// FIXME: integer pipes +defm : JWriteResFpuPair<WriteCvtF2I, JFPU1, 3>; // Float -> Integer. +defm : JWriteResFpuPair<WriteCvtI2F, JFPU1, 3>; // Integer -> Float. +defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conversion. + +def : WriteRes<WriteFVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// Vector integer operations. 
+defm : JWriteResFpuPair<WriteVecALU, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecShift, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecIMul, JFPU0, 2>; +defm : JWriteResFpuPair<WriteShuffle, JFPU01, 1>; +defm : JWriteResFpuPair<WriteBlend, JFPU01, 1>; +defm : JWriteResFpuPair<WriteVecLogic, JFPU01, 1>; +defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>; + +def : WriteRes<WriteVarBlend, [JFPU01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} + +// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2? +def : WriteRes<WriteVarVecShift, [JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> { + let Latency = 6; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteMPSAD, [JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} + +//////////////////////////////////////////////////////////////////////////////// +// String instructions. 
+// Packed Compare Implicit Length Strings, Return Mask +// FIXME: approximate latencies + pipe dependencies +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WritePCmpIStrM, [JFPU01]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> { + let Latency = 12; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [JFPU01]> { + let Latency = 6; + let ResourceCycles = [2]; +} +def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> { + let Latency = 11; + let ResourceCycles = [1, 2]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [JFPU01]> { + let Latency = 13; + let ResourceCycles = [5]; +} +def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> { + let Latency = 18; + let ResourceCycles = [1, 5]; +} + +//////////////////////////////////////////////////////////////////////////////// +// AES Instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteAESDecEnc, [JFPU01, JVIMUL]> { + let Latency = 3; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESDecEncLd, [JFPU01, JLAGU, JVIMUL]> { + let Latency = 8; + let ResourceCycles = [1, 1, 1]; +} + +def : WriteRes<WriteAESIMC, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESIMCLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +def : WriteRes<WriteAESKeyGen, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESKeyGenLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteCLMul, [JVIMUL]> { + let Latency = 2; + let ResourceCycles = [1]; +} +def : WriteRes<WriteCLMulLd, [JLAGU, JVIMUL]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// FIXME: pipe for system/microcode? +def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; } +def : WriteRes<WriteFence, [JSAGU]>; +def : WriteRes<WriteNop, []>; +} // SchedModel + diff --git a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td index 90d858788124..f95d4fa04177 100644 --- a/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/contrib/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> { // Scalar and vector floating point. 
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>; defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>; +defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>; defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>; defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>; defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index a83dd9b2eea7..821044f4bccd 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -29,6 +29,26 @@ X86SelectionDAGInfo::X86SelectionDAGInfo(const DataLayout &DL) X86SelectionDAGInfo::~X86SelectionDAGInfo() {} +bool X86SelectionDAGInfo::isBaseRegConflictPossible( + SelectionDAG &DAG, ArrayRef<unsigned> ClobberSet) const { + // We cannot use TRI->hasBasePointer() until *after* we select all basic + // blocks. Legalization may introduce new stack temporaries with large + // alignment requirements. Fall back to generic code if there are any + // dynamic stack adjustments (hopefully rare) and the base pointer would + // conflict if we had to use it. + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + if (!MFI->hasVarSizedObjects() && !MFI->hasInlineAsmWithSPAdjust()) + return false; + + const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>( + DAG.getSubtarget().getRegisterInfo()); + unsigned BaseReg = TRI->getBaseRegister(); + for (unsigned R : ClobberSet) + if (BaseReg == R) + return true; + return false; +} + SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -39,6 +59,13 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getTarget().getSubtarget<X86Subtarget>(); +#ifndef NDEBUG + // If the base register might conflict with our physical registers, bail out. 
+ unsigned ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI, + X86::ECX, X86::EAX, X86::EDI}; + assert(!isBaseRegConflictPossible(DAG, ClobberSet)); +#endif + // If to a segment-relative address space, use the default lowering. if (DstPtrInfo.getAddrSpace() >= 256) return SDValue(); @@ -201,12 +228,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SrcPtrInfo.getAddrSpace() >= 256) return SDValue(); - // ESI might be used as a base pointer, in that case we can't simply overwrite - // the register. Fall back to generic code. - const X86RegisterInfo *TRI = - static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo()); - if (TRI->hasBasePointer(DAG.getMachineFunction()) && - TRI->getBaseRegister() == X86::ESI) + // If the base register might conflict with our physical registers, bail out. + unsigned ClobberSet[] = {X86::RCX, X86::RSI, X86::RDI, + X86::ECX, X86::ESI, X86::EDI}; + if (isBaseRegConflictPossible(DAG, ClobberSet)) return SDValue(); MVT AVT; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h index c12555a59617..eb7e0ed9de6c 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86SELECTIONDAGINFO_H -#define X86SELECTIONDAGINFO_H +#ifndef LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H +#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H #include "llvm/Target/TargetSelectionDAGInfo.h" @@ -23,6 +23,11 @@ class X86TargetMachine; class X86Subtarget; class X86SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Returns true if it is possible for the base register to conflict with the + /// given set of clobbers for a memory intrinsic. 
+ bool isBaseRegConflictPossible(SelectionDAG &DAG, + ArrayRef<unsigned> ClobberSet) const; + public: explicit X86SelectionDAGInfo(const DataLayout &DL); ~X86SelectionDAGInfo(); diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index 41551a1d677f..983169886ad3 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -13,6 +13,7 @@ #include "X86Subtarget.h" #include "X86InstrInfo.h" +#include "X86TargetMachine.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -67,12 +68,7 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { if (GV->hasDLLImportStorageClass()) return X86II::MO_DLLIMPORT; - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; + bool isDecl = GV->isDeclarationForLinker(); // X86-64 in PIC mode. if (isPICStyleRIPRel()) { @@ -182,23 +178,7 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { return isTargetELF() || TM.getRelocationModel() == Reloc::Static; } -void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) { - AttributeSet FnAttrs = MF->getFunction()->getAttributes(); - Attribute CPUAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); - Attribute FSAttr = - FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); - std::string CPU = - !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : ""; - std::string FS = - !FSAttr.hasAttribute(Attribute::None) ? 
FSAttr.getValueAsString() : ""; - if (!FS.empty()) { - initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); - } -} - -void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { +void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { std::string CPUName = CPU; if (CPUName.empty()) CPUName = "generic"; @@ -277,56 +257,64 @@ void X86Subtarget::initializeEnvironment() { HasVLX = false; HasADX = false; HasSHA = false; + HasSGX = false; HasPRFCHW = false; HasRDSEED = false; + HasSMAP = false; IsBTMemSlow = false; IsSHLDSlow = false; IsUAMemFast = false; - HasVectorUAMem = false; + IsUAMem32Slow = false; + HasSSEUnalignedMem = false; HasCmpxchg16b = false; UseLeaForSP = false; - HasSlowDivide = false; + HasSlowDivide32 = false; + HasSlowDivide64 = false; PadShortFunctions = false; CallRegIndirect = false; LEAUsesAG = false; SlowLEA = false; SlowIncDec = false; + UseSqrtEst = false; + UseReciprocalEst = false; stackAlignment = 4; // FIXME: this is a known good value for Yonah. How about others? MaxInlineSizeThreshold = 128; } -static std::string computeDataLayout(const X86Subtarget &ST) { +static std::string computeDataLayout(const Triple &TT) { // X86 is little endian std::string Ret = "e"; - Ret += DataLayout::getManglingComponent(ST.getTargetTriple()); + Ret += DataLayout::getManglingComponent(TT); // X86 and x32 have 32 bit pointers. - if (ST.isTarget64BitILP32() || !ST.is64Bit()) + if ((TT.isArch64Bit() && + (TT.getEnvironment() == Triple::GNUX32 || TT.isOSNaCl())) || + !TT.isArch64Bit()) Ret += "-p:32:32"; // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. - if (ST.is64Bit() || ST.isOSWindows() || ST.isTargetNaCl()) + if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) Ret += "-i64:64"; else Ret += "-f64:32:64"; // Some ABIs align long double to 128 bits, others to 32. 
- if (ST.isTargetNaCl()) + if (TT.isOSNaCl()) ; // No f80 - else if (ST.is64Bit() || ST.isTargetDarwin()) + else if (TT.isArch64Bit() || TT.isOSDarwin()) Ret += "-f80:128"; else Ret += "-f80:32"; // The registers can hold 8, 16, 32 or, in x86-64, 64 bits. - if (ST.is64Bit()) + if (TT.isArch64Bit()) Ret += "-n8:16:32:64"; else Ret += "-n8:16:32"; // The stack is aligned to 32 bits on some ABIs and 128 bits on others. - if (!ST.is64Bit() && ST.isOSWindows()) + if (!TT.isArch64Bit() && TT.isOSWindows()) Ret += "-S32"; else Ret += "-S128"; @@ -337,26 +325,45 @@ static std::string computeDataLayout(const X86Subtarget &ST) { X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { initializeEnvironment(); - resetSubtargetFeatures(CPU, FS); + initSubtargetFeatures(CPU, FS); return *this; } X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, X86TargetMachine &TM, + const std::string &FS, const X86TargetMachine &TM, unsigned StackAlignOverride) : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others), PICStyle(PICStyles::None), TargetTriple(TT), + DL(computeDataLayout(TargetTriple)), StackAlignOverride(StackAlignOverride), In64BitMode(TargetTriple.getArch() == Triple::x86_64), In32BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() != Triple::CODE16), In16BitMode(TargetTriple.getArch() == Triple::x86 && TargetTriple.getEnvironment() == Triple::CODE16), - DL(computeDataLayout(*this)), TSInfo(DL), - InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM), - FrameLowering(TargetFrameLowering::StackGrowsDown, getStackAlignment(), - is64Bit() ? -8 : -4), - JITInfo(hasSSE1()) {} + TSInfo(DL), InstrInfo(initializeSubtargetDependencies(CPU, FS)), + TLInfo(TM), FrameLowering(TargetFrameLowering::StackGrowsDown, + getStackAlignment(), is64Bit() ? -8 : -4) { + // Determine the PICStyle based on the target selected. 
+ if (TM.getRelocationModel() == Reloc::Static) { + // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. + setPICStyle(PICStyles::None); + } else if (is64Bit()) { + // PIC in 64 bit mode is always rip-rel. + setPICStyle(PICStyles::RIPRel); + } else if (isTargetCOFF()) { + setPICStyle(PICStyles::None); + } else if (isTargetDarwin()) { + if (TM.getRelocationModel() == Reloc::PIC_) + setPICStyle(PICStyles::StubPIC); + else { + assert(TM.getRelocationModel() == Reloc::DynamicNoPIC); + setPICStyle(PICStyles::StubDynamicNoPIC); + } + } else if (isTargetELF()) { + setPICStyle(PICStyles::GOT); + } +} bool X86Subtarget::enableEarlyIfConversion() const { return hasCMov() && X86EarlyIfConv; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index 5f5df5e0818c..e0263d66e928 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -11,13 +11,12 @@ // //===----------------------------------------------------------------------===// -#ifndef X86SUBTARGET_H -#define X86SUBTARGET_H +#ifndef LLVM_LIB_TARGET_X86_X86SUBTARGET_H +#define LLVM_LIB_TARGET_X86_X86SUBTARGET_H #include "X86FrameLowering.h" #include "X86ISelLowering.h" #include "X86InstrInfo.h" -#include "X86JITInfo.h" #include "X86SelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/IR/CallingConv.h" @@ -139,12 +138,18 @@ protected: /// HasSHA - Processor has SHA instructions. bool HasSHA; + /// HasSGX - Processor has SGX instructions. + bool HasSGX; + /// HasPRFCHW - Processor has PRFCHW instructions. bool HasPRFCHW; /// HasRDSEED - Processor has RDSEED instructions. bool HasRDSEED; + /// HasSMAP - Processor has SMAP instructions. + bool HasSMAP; + /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; @@ -154,9 +159,12 @@ protected: /// IsUAMemFast - True if unaligned memory access is fast. 
bool IsUAMemFast; - /// HasVectorUAMem - True if SIMD operations can have unaligned memory - /// operands. This may require setting a feature bit in the processor. - bool HasVectorUAMem; + /// True if unaligned 32-byte memory accesses are slow. + bool IsUAMem32Slow; + + /// True if SSE operations can have unaligned memory operands. + /// This may require setting a configuration bit in the processor. + bool HasSSEUnalignedMem; /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction; /// this is true for most x86-64 chips, but not the first AMD chips. @@ -166,9 +174,13 @@ protected: /// the stack pointer. This is an optimization for Intel Atom processors. bool UseLeaForSP; - /// HasSlowDivide - True if smaller divides are significantly faster than - /// full divides and should be used when possible. - bool HasSlowDivide; + /// HasSlowDivide32 - True if 8-bit divisions are significantly faster than + /// 32-bit divisions and should be used when possible. + bool HasSlowDivide32; + + /// HasSlowDivide64 - True if 16-bit divides are significantly faster than + /// 64-bit divisions and should be used when possible. + bool HasSlowDivide64; /// PadShortFunctions - True if the short functions should be padded to prevent /// a stall when returning too early. @@ -187,6 +199,16 @@ protected: /// SlowIncDec - True if INC and DEC instructions are slow when writing to flags bool SlowIncDec; + /// Use the RSQRT* instructions to optimize square root calculations. + /// For this to be profitable, the cost of FSQRT and FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. + bool UseSqrtEst; + + /// Use the RCP* instructions to optimize FP division calculations. + /// For this to be profitable, the cost of FDIV must be + /// substantially higher than normal FP ops like FADD and FMUL. 
+ bool UseReciprocalEst; + /// Processor has AVX-512 PreFetch Instructions bool HasPFI; @@ -220,6 +242,9 @@ protected: InstrItineraryData InstrItins; private: + // Calculates type size & alignment + const DataLayout DL; + /// StackAlignOverride - Override the stack alignment. unsigned StackAlignOverride; @@ -232,30 +257,35 @@ private: /// In16BitMode - True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; - // Calculates type size & alignment - const DataLayout DL; X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which // X86TargetLowering needs. X86InstrInfo InstrInfo; X86TargetLowering TLInfo; X86FrameLowering FrameLowering; - X86JITInfo JITInfo; public: /// This constructor initializes the data members to match that /// of the specified triple. /// X86Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, X86TargetMachine &TM, + const std::string &FS, const X86TargetMachine &TM, unsigned StackAlignOverride); - const X86TargetLowering *getTargetLowering() const { return &TLInfo; } - const X86InstrInfo *getInstrInfo() const { return &InstrInfo; } - const DataLayout *getDataLayout() const { return &DL; } - const X86FrameLowering *getFrameLowering() const { return &FrameLowering; } - const X86SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; } - X86JITInfo *getJITInfo() { return &JITInfo; } + const X86TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const DataLayout *getDataLayout() const override { return &DL; } + const X86FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const X86SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + const X86RegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } /// getStackAlignment - Returns the minimum alignment 
known to hold of the /// stack frame on entry to the function and which must be maintained by every @@ -270,14 +300,12 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - /// \brief Reset the features for the X86 target. - void resetSubtargetFeatures(const MachineFunction *MF) override; private: /// \brief Initialize the full set of dependencies so we can use an initializer /// list for X86Subtarget. X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); void initializeEnvironment(); - void resetSubtargetFeatures(StringRef CPU, StringRef FS); + void initSubtargetFeatures(StringRef CPU, StringRef FS); public: /// Is this x86_64? (disregarding specific ABI / programming model) bool is64Bit() const { @@ -295,12 +323,13 @@ public: /// Is this x86_64 with the ILP32 programming model (x32 ABI)? bool isTarget64BitILP32() const { return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32 || - TargetTriple.getOS() == Triple::NaCl); + TargetTriple.isOSNaCl()); } /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? 
bool isTarget64BitLP64() const { - return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32); + return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32 && + !TargetTriple.isOSNaCl()); } PICStyles::Style getPICStyle() const { return PICStyle; } @@ -341,20 +370,26 @@ public: bool hasHLE() const { return HasHLE; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } + bool hasSGX() const { return HasSGX; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasSMAP() const { return HasSMAP; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isUnalignedMemAccessFast() const { return IsUAMemFast; } - bool hasVectorUAMem() const { return HasVectorUAMem; } + bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } + bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } - bool hasSlowDivide() const { return HasSlowDivide; } + bool hasSlowDivide32() const { return HasSlowDivide32; } + bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } bool callRegIndirect() const { return CallRegIndirect; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slowIncDec() const { return SlowIncDec; } + bool useSqrtEst() const { return UseSqrtEst; } + bool useReciprocalEst() const { return UseReciprocalEst; } bool hasCDI() const { return HasCDI; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } @@ -368,16 +403,13 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } - bool isTargetFreeBSD() const { - return TargetTriple.getOS() == Triple::FreeBSD; - } - bool isTargetSolaris() const { - return TargetTriple.getOS() 
== Triple::Solaris; - } + bool isTargetFreeBSD() const { return TargetTriple.isOSFreeBSD(); } + bool isTargetDragonFly() const { return TargetTriple.isOSDragonFly(); } + bool isTargetSolaris() const { return TargetTriple.isOSSolaris(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } - bool isTargetMacho() const { return TargetTriple.isOSBinFormatMachO(); } + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } @@ -400,6 +432,10 @@ public: return TargetTriple.isWindowsGNUEnvironment(); } + bool isTargetWindowsItanium() const { + return TargetTriple.isWindowsItaniumEnvironment(); + } + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } bool isOSWindows() const { return TargetTriple.isOSWindows(); } @@ -466,7 +502,9 @@ public: /// getInstrItins = Return the instruction itineraries based on the /// subtarget selection. 
- const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } AntiDepBreakMode getAntiDepBreakMode() const override { return TargetSubtargetInfo::ANTIDEP_CRITICAL; diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index f12140f1f161..1fc6b20eab09 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -13,7 +13,9 @@ #include "X86TargetMachine.h" #include "X86.h" +#include "X86TargetObjectFile.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" @@ -27,7 +29,23 @@ extern "C" void LLVMInitializeX86Target() { RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); } -void X86TargetMachine::anchor() { } +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::x86_64) + return make_unique<X86_64MachoTargetObjectFile>(); + return make_unique<TargetLoweringObjectFileMachO>(); + } + + if (TT.isOSLinux()) + return make_unique<X86LinuxTargetObjectFile>(); + if (TT.isOSBinFormatELF()) + return make_unique<TargetLoweringObjectFileELF>(); + if (TT.isKnownWindowsMSVCEnvironment()) + return make_unique<X86WindowsTargetObjectFile>(); + if (TT.isOSBinFormatCOFF()) + return make_unique<TargetLoweringObjectFileCOFF>(); + llvm_unreachable("unknown subtarget type"); +} /// X86TargetMachine ctor - Create an X86 target. 
/// @@ -36,27 +54,8 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + TLOF(createTLOF(Triple(getTargetTriple()))), Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { - // Determine the PICStyle based on the target selected. - if (getRelocationModel() == Reloc::Static) { - // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None. - Subtarget.setPICStyle(PICStyles::None); - } else if (Subtarget.is64Bit()) { - // PIC in 64 bit mode is always rip-rel. - Subtarget.setPICStyle(PICStyles::RIPRel); - } else if (Subtarget.isTargetCOFF()) { - Subtarget.setPICStyle(PICStyles::None); - } else if (Subtarget.isTargetDarwin()) { - if (getRelocationModel() == Reloc::PIC_) - Subtarget.setPICStyle(PICStyles::StubPIC); - else { - assert(getRelocationModel() == Reloc::DynamicNoPIC); - Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC); - } - } else if (Subtarget.isTargetELF()) { - Subtarget.setPICStyle(PICStyles::GOT); - } - // default to hard float ABI if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = FloatABI::Hard; @@ -71,6 +70,47 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, initAsmInfo(); } +X86TargetMachine::~X86TargetMachine() {} + +const X86Subtarget * +X86TargetMachine::getSubtargetImpl(const Function &F) const { + AttributeSet FnAttrs = F.getAttributes(); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? 
FSAttr.getValueAsString().str() + : TargetFS; + + // FIXME: This is related to the code below to reset the target options, + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions. + Attribute SFAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float"); + bool SoftFloat = !SFAttr.hasAttribute(Attribute::None) + ? SFAttr.getValueAsString() == "true" + : Options.UseSoftFloat; + + auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true" + : "use-soft-float=false")]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this, + Options.StackAlignmentOverride); + } + return I.get(); +} + //===----------------------------------------------------------------------===// // Command line options for x86 //===----------------------------------------------------------------------===// @@ -114,9 +154,9 @@ public: void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreEmitPass() override; + void addPreRegAlloc() override; + void addPostRegAlloc() override; + void addPreEmitPass() override; }; } // namespace @@ -125,7 +165,7 @@ TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { } void X86PassConfig::addIRPasses() { - addPass(createX86AtomicExpandPass(&getX86TargetMachine())); + addPass(createAtomicExpandPass(&getX86TargetMachine())); TargetPassConfig::addIRPasses(); } @@ -148,39 +188,23 @@ bool X86PassConfig::addILPOpts() { return true; } -bool X86PassConfig::addPreRegAlloc() { - return false; // 
-print-machineinstr shouldn't print after this. +void X86PassConfig::addPreRegAlloc() { + addPass(createX86CallFrameOptimization()); } -bool X86PassConfig::addPostRegAlloc() { +void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); - return true; // -print-machineinstr should print after this. } -bool X86PassConfig::addPreEmitPass() { - bool ShouldPrint = false; - if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) { +void X86PassConfig::addPreEmitPass() { + if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); - ShouldPrint = true; - } - if (UseVZeroUpper) { + if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); - ShouldPrint = true; - } if (getOptLevel() != CodeGenOpt::None) { addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); - ShouldPrint = true; } - - return ShouldPrint; -} - -bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, - JITCodeEmitter &JCE) { - PM.add(createX86JITCodeEmitterPass(*this, JCE)); - - return false; } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm/lib/Target/X86/X86TargetMachine.h index 41d51570b9ab..916278cd7de3 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef X86TARGETMACHINE_H -#define X86TARGETMACHINE_H +#ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H +#define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/IR/DataLayout.h" @@ -23,46 +23,29 @@ namespace llvm { class StringRef; class X86TargetMachine final : public LLVMTargetMachine { - virtual void anchor(); + std::unique_ptr<TargetLoweringObjectFile> TLOF; X86Subtarget Subtarget; + mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap; + public: 
X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); + ~X86TargetMachine() override; - const DataLayout *getDataLayout() const override { - return getSubtargetImpl()->getDataLayout(); - } - const X86InstrInfo *getInstrInfo() const override { - return getSubtargetImpl()->getInstrInfo(); - } - const TargetFrameLowering *getFrameLowering() const override { - return getSubtargetImpl()->getFrameLowering(); - } - X86JITInfo *getJITInfo() override { return Subtarget.getJITInfo(); } const X86Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const X86TargetLowering *getTargetLowering() const override { - return getSubtargetImpl()->getTargetLowering(); - } - const X86SelectionDAGInfo *getSelectionDAGInfo() const override { - return getSubtargetImpl()->getSelectionDAGInfo(); - } - const X86RegisterInfo *getRegisterInfo() const override { - return &getInstrInfo()->getRegisterInfo(); - } - const InstrItineraryData *getInstrItineraryData() const override { - return &getSubtargetImpl()->getInstrItineraryData(); - } + const X86Subtarget *getSubtargetImpl(const Function &F) const override; /// \brief Register X86 analysis passes with a pass manager. void addAnalysisPasses(PassManagerBase &PM) override; // Set up the pass pipeline. 
TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - - bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override; + TargetLoweringObjectFile *getObjFileLowering() const override { + return TLOF.get(); + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 4a10b7ea6b42..6a6988a2e2f6 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H -#define LLVM_TARGET_X86_TARGETOBJECTFILE_H +#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/Target/TargetLoweringObjectFile.h" diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index c961e2f5b2c8..67488f7ad791 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -48,8 +48,8 @@ public: } X86TTI(const X86TargetMachine *TM) - : ImmutablePass(ID), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { + : ImmutablePass(ID), ST(TM->getSubtargetImpl()), + TLI(TM->getSubtargetImpl()->getTargetLowering()) { initializeX86TTIPass(*PassRegistry::getPassRegistry()); } @@ -82,9 +82,10 @@ public: unsigned getNumberOfRegisters(bool Vector) const override; unsigned getRegisterBitWidth(bool Vector) const override; - unsigned getMaximumUnrollFactor() const override; + unsigned getMaxInterleaveFactor() const override; unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override; + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; unsigned 
getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -110,6 +111,8 @@ public: Type *Ty) const override; unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty) const override; + bool isLegalMaskedLoad (Type *DataType, int Consecutive) const override; + bool isLegalMaskedStore(Type *DataType, int Consecutive) const override; /// @} }; @@ -166,7 +169,7 @@ unsigned X86TTI::getRegisterBitWidth(bool Vector) const { } -unsigned X86TTI::getMaximumUnrollFactor() const { +unsigned X86TTI::getMaxInterleaveFactor() const { if (ST->isAtom()) return 1; @@ -178,15 +181,37 @@ unsigned X86TTI::getMaximumUnrollFactor() const { return 2; } -unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned X86TTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { // Legalize the type. std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + if (ISD == ISD::SDIV && + Op2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On X86, vector signed division by a constant power of two is + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties may not be the same as those of the + // previous operation; conservatively assume OP_None.
+ unsigned Cost = + 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + + return Cost; + } + static const CostTblEntry<MVT::SimpleValueType> AVX2UniformConstCostTable[] = { { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -202,6 +227,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return LT.first * AVX2UniformConstCostTable[Idx].Cost; } + static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = { + { ISD::SHL, MVT::v16i32, 1 }, + { ISD::SRL, MVT::v16i32, 1 }, + { ISD::SRA, MVT::v16i32, 1 }, + { ISD::SHL, MVT::v8i64, 1 }, + { ISD::SRL, MVT::v8i64, 1 }, + { ISD::SRA, MVT::v8i64, 1 }, + }; + static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. @@ -237,6 +271,11 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::UDIV, MVT::v4i64, 4*20 }, }; + if (ST->hasAVX512()) { + int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second); + if (Idx != -1) + return LT.first * AVX512CostTable[Idx].Cost; + } // Look for AVX2 lowering tricks. if (ST->hasAVX2()) { if (ISD == ISD::SHL && LT.second == MVT::v16i16 && @@ -315,7 +354,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v8i16, 8*10 }, // Scalarized. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized. 
{ ISD::SRL, MVT::v8i16, 8*10 }, // Scalarized. @@ -488,7 +527,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or }; - + if (ST->hasSSSE3()) { int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx != -1) @@ -501,7 +540,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd - + // This is expanded into a long sequence of four extract + four insert. {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw. @@ -509,7 +548,7 @@ unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48} }; - // Fall-back (SSE3 and SSE2). + // Fall-back (SSE3 and SSE2). int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); if (Idx != -1) return LT.first * SSEAltShuffleTbl[Idx].Cost; @@ -541,7 +580,7 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, // There are faster sequences for float conversions. 
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, @@ -557,6 +596,45 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { return LTSrc.first * SSE2ConvTbl[Idx].Cost; } + static const TypeConversionCostTblEntry<MVT::SimpleValueType> + AVX512ConversionTbl[] = { + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, + { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, + { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, + { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, + + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, + { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, + + // v16i1 -> v16i32 - load + broadcast + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, + + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + }; + + if (ST->hasAVX512()) { + int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second, + LTSrc.second); + if (Idx != -1) + return AVX512ConversionTbl[Idx].Cost; + } EVT SrcTy = 
TLI->getValueType(Src); EVT DstTy = TLI->getValueType(Dst); @@ -589,6 +667,11 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const { { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, + + { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, + { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, + + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; static const TypeConversionCostTblEntry<MVT::SimpleValueType> @@ -715,6 +798,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 1 }, }; + static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = { + { ISD::SETCC, MVT::v8i64, 1 }, + { ISD::SETCC, MVT::v16i32, 1 }, + { ISD::SETCC, MVT::v8f64, 1 }, + { ISD::SETCC, MVT::v16f32, 1 }, + }; + + if (ST->hasAVX512()) { + int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy); + if (Idx != -1) + return LT.first * AVX512CostTbl[Idx].Cost; + } + if (ST->hasAVX2()) { int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy); if (Idx != -1) @@ -836,17 +932,17 @@ unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) const { - + std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy); - + MVT MTy = LT.second; - + int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - - // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput - // and make it as the cost. - + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, @@ -854,7 +950,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". 
{ ISD::ADD, MVT::v8i16, 5 }, }; - + static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = { { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, @@ -873,7 +969,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". }; - + static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = { { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, @@ -884,14 +980,14 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 4 }, { ISD::ADD, MVT::v8i32, 5 }, }; - + if (IsPairwise) { if (ST->hasAVX()) { int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); if (Idx != -1) return LT.first * AVX1CostTblPairWise[Idx].Cost; } - + if (ST->hasSSE42()) { int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); if (Idx != -1) @@ -903,7 +999,7 @@ unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy, if (Idx != -1) return LT.first * AVX1CostTblNoPairWise[Idx].Cost; } - + if (ST->hasSSE42()) { int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); if (Idx != -1) @@ -1062,3 +1158,19 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, } return X86TTI::getIntImmCost(Imm, Ty); } + +bool X86TTI::isLegalMaskedLoad(Type *DataTy, int Consecutive) const { + int DataWidth = DataTy->getPrimitiveSizeInBits(); + + // Todo: AVX512 allows gather/scatter, works with strided and random as well + if ((DataWidth < 32) || (Consecutive == 0)) + return false; + if (ST->hasAVX512() || ST->hasAVX2()) + return true; + return false; +} + +bool X86TTI::isLegalMaskedStore(Type *DataType, int Consecutive) const { + return isLegalMaskedLoad(DataType, Consecutive); +} + diff --git a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp index 0bb5f990cae7..5c01f3e1ff63 100644 --- 
a/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/contrib/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -250,20 +250,24 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>(); if (!ST.hasAVX() || ST.hasAVX512()) return false; - TII = MF.getTarget().getInstrInfo(); + TII = MF.getSubtarget().getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); EverMadeChange = false; + bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); + // Fast check: if the function doesn't use any ymm registers, we don't need // to insert any VZEROUPPER instructions. This is constant-time, so it is // cheap in the common case of no ymm use. - bool YMMUsed = false; - const TargetRegisterClass *RC = &X86::VR256RegClass; - for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); - i != e; i++) { - if (!MRI.reg_nodbg_empty(*i)) { - YMMUsed = true; - break; + bool YMMUsed = FnHasLiveInYmm; + if (!YMMUsed) { + const TargetRegisterClass *RC = &X86::VR256RegClass; + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e; + i++) { + if (!MRI.reg_nodbg_empty(*i)) { + YMMUsed = true; + break; + } } } if (!YMMUsed) { @@ -282,7 +286,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { // If any YMM regs are live in to this function, add the entry block to the // DirtySuccessors list - if (checkFnHasLiveInYmm(MRI)) + if (FnHasLiveInYmm) addDirtySuccessor(MF.front()); // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add |