Diffstat (limited to 'lib/Target/X86')
55 files changed, 1741 insertions, 1194 deletions
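Two of the decoder changes in this diff lend themselves to a quick sanity check: readSIB() now computes the SIB scale as a shift of the two-bit scale field instead of a four-case switch, and translateImmediate() gains a TYPE_REL16 case that sign-extends 16-bit branch displacements the same way TYPE_REL8 already did. The standalone sketch below only illustrates those two equivalences; the helper names (scaleFieldOf, signExtend16) are made up for the example and are not part of the real X86DisassemblerDecoder interface.

// Minimal sketch checking two details from the X86Disassembler hunks below.
// The helpers here are illustrative stand-ins, not the decoder's real API.
#include <cassert>
#include <cstdint>

// The SIB byte's two-bit scale field encodes factors 1, 2, 4 and 8, so the
// patch's "1 << scale" form matches the switch it replaces.
static unsigned scaleFieldOf(uint8_t sib) { return (sib >> 6) & 0x3; }

// TYPE_REL16 immediates are sign-extended like TYPE_REL8: if the top bit of
// the 16-bit value is set, all higher bits are set as well.
static uint64_t signExtend16(uint64_t immediate) {
  if (immediate & 0x8000)
    immediate |= ~(0xffffull);
  return immediate;
}

int main() {
  for (unsigned field = 0; field < 4; ++field) {
    unsigned viaSwitch = 0;
    switch (field) {
    case 0: viaSwitch = 1; break;
    case 1: viaSwitch = 2; break;
    case 2: viaSwitch = 4; break;
    case 3: viaSwitch = 8; break;
    }
    assert(viaSwitch == (1u << field));           // shift form used in the patch
    assert(scaleFieldOf(uint8_t(field << 6)) == field);
  }

  assert(signExtend16(0x0010) == 0x0010);                 // positive rel16
  assert(signExtend16(0xfff0) == 0xfffffffffffffff0ull);  // negative rel16
  return 0;
}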
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 6ba897b8636d..9eee4a0f3d82 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -1080,4 +1080,4 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, return new X86AsmInstrumentation(STI); } -} // namespace llvm +} // End llvm namespace diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 341fc81c0480..19ebcc44f61e 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -61,6 +61,6 @@ protected: unsigned InitialFrameReg; }; -} // namespace llvm +} // End llvm namespace #endif diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index b3066efbab24..7ec02408ffa4 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -238,18 +238,34 @@ struct X86Operand : public MCParsedAsmOperand { return Kind == Memory && (!Mem.Size || Mem.Size == 32) && getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; } + bool isMemVX32X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31; + } bool isMemVY32() const { return Kind == Memory && (!Mem.Size || Mem.Size == 32) && getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; } + bool isMemVY32X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 32) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31; + } bool isMemVX64() const { return Kind == Memory && (!Mem.Size || Mem.Size == 64) && getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM15; } + bool isMemVX64X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::XMM0 && getMemIndexReg() <= X86::XMM31; + } bool isMemVY64() const { return Kind == Memory && (!Mem.Size || Mem.Size == 64) && getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15; } + bool isMemVY64X() const { + return Kind == Memory && (!Mem.Size || Mem.Size == 64) && + getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM31; + } bool isMemVZ32() const { return Kind == Memory && (!Mem.Size || Mem.Size == 32) && getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31; diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 5b53fbef3f71..cfc3ee2fb08f 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -69,7 +69,7 @@ namespace X86 { extern Target TheX86_32Target, TheX86_64Target; -} // namespace llvm +} static bool translateInstruction(MCInst &target, InternalInstruction &source, @@ -551,9 +551,15 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, case TYPE_REL8: isBranch = true; pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; - if(immediate & 0x80) + if (immediate & 0x80) immediate |= ~(0xffull); break; + case TYPE_REL16: + isBranch = true; + pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize; + if (immediate & 0x8000) + immediate |= ~(0xffffull); + break; case TYPE_REL32: case TYPE_REL64: isBranch = true; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 
d990bf3484bf..f73fa75f888e 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -1165,35 +1165,30 @@ static int readSIB(struct InternalInstruction* insn) { return -1; index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); + + // FIXME: The fifth bit (bit index 4) is only to be used for instructions + // that understand VSIB indexing. ORing the bit in here is mildy dangerous + // because performing math on an 'enum SIBIndex' can produce garbage. + // Excluding the "none" value, it should cover 6 spaces of register names: + // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI + // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX + // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX + // - 32 possibilities for each of XMM, YMM, ZMM registers + // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode, + // summing in a fully decoded index between 0 and 31 can end up with a value + // that looks like something in the low half of the XMM range. + // translateRMMemory() tries to reverse the damage, with only partial success, + // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt" if (insn->vectorExtensionType == TYPE_EVEX) index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4; - switch (index) { - case 0x4: + if (index == 0x4) { insn->sibIndex = SIB_INDEX_NONE; - break; - default: + } else { insn->sibIndex = (SIBIndex)(sibIndexBase + index); - if (insn->sibIndex == SIB_INDEX_sib || - insn->sibIndex == SIB_INDEX_sib64) - insn->sibIndex = SIB_INDEX_NONE; - break; } - switch (scaleFromSIB(insn->sib)) { - case 0: - insn->sibScale = 1; - break; - case 1: - insn->sibScale = 2; - break; - case 2: - insn->sibScale = 4; - break; - case 3: - insn->sibScale = 8; - break; - } + insn->sibScale = 1 << scaleFromSIB(insn->sib); base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index ac484f317276..62b6b73e7864 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -140,6 +140,6 @@ public: private: bool HasCustomInstComment; }; -} // namespace llvm +} #endif diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 2bee518fed68..6e371da37290 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -159,6 +159,6 @@ public: } }; -} // namespace llvm +} #endif diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 2d85f84d6669..3e0dc1424609 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -29,13 +29,6 @@ #include "llvm/Support/raw_ostream.h" using namespace llvm; -// Option to allow disabling arithmetic relaxation to workaround PR9807, which -// is useful when running bitwise comparison experiments on Darwin. We should be -// able to remove this once PR9807 is resolved. 
-static cl::opt<bool> -MCDisableArithRelaxation("mc-x86-disable-arith-relaxation", - cl::desc("Disable relaxation of arithmetic instruction for X86")); - static unsigned getFixupKindLog2Size(unsigned Kind) { switch (Kind) { default: @@ -243,29 +236,18 @@ bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { if (getRelaxedOpcodeBranch(Inst.getOpcode()) != Inst.getOpcode()) return true; - if (MCDisableArithRelaxation) - return false; - // Check if this instruction is ever relaxable. if (getRelaxedOpcodeArith(Inst.getOpcode()) == Inst.getOpcode()) return false; - // Check if it has an expression and is not RIP relative. - bool hasExp = false; - bool hasRIP = false; - for (unsigned i = 0; i < Inst.getNumOperands(); ++i) { - const MCOperand &Op = Inst.getOperand(i); - if (Op.isExpr()) - hasExp = true; - - if (Op.isReg() && Op.getReg() == X86::RIP) - hasRIP = true; - } + // Check if the relaxable operand has an expression. For the current set of + // relaxable instructions, the relaxable operand is always the last operand. + unsigned RelaxableOp = Inst.getNumOperands() - 1; + if (Inst.getOperand(RelaxableOp).isExpr()) + return true; - // FIXME: Why exactly do we need the !hasRIP? Is it just a limitation on - // how we do relaxations? - return hasExp && !hasRIP; + return false; } bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, @@ -426,7 +408,7 @@ namespace CU { UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF }; -} // namespace CU +} // end CU namespace class DarwinX86AsmBackend : public X86AsmBackend { const MCRegisterInfo &MRI; diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 69e9c7b4a83e..f0d00b0c1bc3 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -41,7 +41,7 @@ namespace X86 { /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; -} // namespace X86 +} // end namespace X86; /// X86II - This namespace holds all of the target specific flags that /// instruction info tracks. @@ -213,11 +213,7 @@ namespace X86II { /// the offset from beginning of section. /// /// This is the TLS offset for the COFF/Windows TLS mechanism. - MO_SECREL, - - /// MO_NOPREFIX - On a symbol operand this indicates that the symbol should - /// not be mangled with a prefix. 
- MO_NOPREFIX, + MO_SECREL }; enum : uint64_t { @@ -762,8 +758,8 @@ namespace X86II { return (reg == X86::SPL || reg == X86::BPL || reg == X86::SIL || reg == X86::DIL); } -} // namespace X86II +} -} // namespace llvm +} // end namespace llvm; #endif diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 512afebf482e..a33468dc4769 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -28,7 +28,7 @@ namespace { unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const override; }; -} // namespace +} X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp index 7c09e5d59580..89f394582631 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp @@ -26,14 +26,17 @@ public: X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} const MCExpr *createExprForRelocation(RelocationRef Rel) override { - uint64_t RelType; Rel.getType(RelType); - symbol_iterator SymI = Rel.getSymbol(); + uint64_t RelType = Rel.getType(); + elf_symbol_iterator SymI = Rel.getSymbol(); + + ErrorOr<StringRef> SymNameOrErr = SymI->getName(); + if (std::error_code EC = SymNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef SymName = *SymNameOrErr; - StringRef SymName; SymI->getName(SymName); uint64_t SymAddr; SymI->getAddress(SymAddr); uint64_t SymSize = SymI->getSize(); - auto *Obj = cast<ELFObjectFileBase>(Rel.getObjectFile()); - int64_t Addend = *Obj->getRelocationAddend(Rel.getRawDataRefImpl()); + int64_t Addend = *ELFRelocationRef(Rel).getAddend(); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymName); // FIXME: check that the value is actually the same. diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index a523a32b2a2d..4899900dcef9 100644 --- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -28,7 +28,7 @@ enum Fixups { LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind }; -} // namespace X86 -} // namespace llvm +} +} #endif diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 020803b57f76..6221baba1793 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -62,7 +62,7 @@ void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI); /// do not need to go through TargetRegistry. MCSubtargetInfo *createX86MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS); -} // namespace X86_MC +} MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -98,7 +98,7 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); /// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); -} // namespace llvm +} // End llvm namespace // Defines symbolic names for X86 registers. 
This defines a mapping from diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp index a5aadd6a385e..c9479b62f7b6 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp @@ -25,12 +25,15 @@ public: X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {} const MCExpr *createExprForRelocation(RelocationRef Rel) override { - const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile()); + const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObject()); - uint64_t RelType; Rel.getType(RelType); + uint64_t RelType = Rel.getType(); symbol_iterator SymI = Rel.getSymbol(); - StringRef SymName; SymI->getName(SymName); + ErrorOr<StringRef> SymNameOrErr = SymI->getName(); + if (std::error_code EC = SymNameOrErr.getError()) + report_fatal_error(EC.message()); + StringRef SymName = *SymNameOrErr; uint64_t SymAddr; SymI->getAddress(SymAddr); any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl()); @@ -89,10 +92,11 @@ public: symbol_iterator RSymI = Rel.getSymbol(); uint64_t RSymAddr; RSymI->getAddress(RSymAddr); - StringRef RSymName; - RSymI->getName(RSymName); + ErrorOr<StringRef> RSymName = RSymI->getName(); + if (std::error_code EC = RSymName.getError()) + report_fatal_error(EC.message()); - MCSymbol *RSym = Ctx.getOrCreateSymbol(RSymName); + MCSymbol *RSym = Ctx.getOrCreateSymbol(*RSymName); if (!RSym->isVariable()) RSym->setVariableValue(MCConstantExpr::create(RSymAddr, Ctx)); diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 773fbf41a7b1..9e801fc8f191 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -69,7 +69,7 @@ public: FixedValue); } }; -} // namespace +} static bool isFixupKindRIPRel(unsigned Kind) { return Kind == X86::reloc_riprel_4byte || diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 7d262cdbf51d..bd1bc9943b6d 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -31,7 +31,7 @@ namespace { bool IsCrossSection, const MCAsmBackend &MAB) const override; }; -} // namespace +} X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit) : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64 diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index dc6dd66bcd85..92f42b68ae51 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -46,7 +46,7 @@ void X86WinCOFFStreamer::FinishImpl() { MCWinCOFFStreamer::FinishImpl(); } -} // namespace +} MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 1e7d94287c4a..ef3318ba7580 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -431,4 +431,4 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { for (unsigned i = 1; i < NumElts; i++) Mask.push_back(IsLoad ? 
static_cast<int>(SM_SentinelZero) : i); } -} // namespace llvm +} // llvm namespace diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 0139297fc72d..14b69434806e 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -100,6 +100,6 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a scalar float move instruction as a shuffle mask. void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); -} // namespace llvm +} // llvm namespace #endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 80f457984951..8403ae6101df 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -80,6 +80,6 @@ FunctionPass *createX86WinEHStatePass(); /// must run after prologue/epilogue insertion and before lowering /// the MachineInstr to MC. FunctionPass *createX86ExpandPseudoPass(); -} // namespace llvm +} // End llvm namespace #endif diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index 205140144ab5..ba33248d2039 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -581,34 +581,6 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { return AsmPrinter::GetCPISymbol(CPID); } -void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) { - SmallString<128> Directive; - raw_svector_ostream OS(Directive); - StringRef Name = Sym->getName(); - const Triple &TT = TM.getTargetTriple(); - - if (TT.isKnownWindowsMSVCEnvironment()) - OS << " /EXPORT:"; - else - OS << " -export:"; - - if ((TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) && - (Name[0] == getDataLayout().getGlobalPrefix())) - Name = Name.drop_front(); - - OS << Name; - - if (IsData) { - if (TT.isKnownWindowsMSVCEnvironment()) - OS << ",DATA"; - else - OS << ",data"; - } - - OS.flush(); - OutStreamer->EmitBytes(Directive); -} - void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); @@ -692,39 +664,28 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { } if (TT.isOSBinFormatCOFF()) { - // Necessary for dllexport support - std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals; + const TargetLoweringObjectFileCOFF &TLOFCOFF = + static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); - for (const auto &Function : M) - if (Function.hasDLLExportStorageClass() && !Function.isDeclaration()) - DLLExportedFns.push_back(getSymbol(&Function)); + std::string Flags; + raw_string_ostream FlagsOS(Flags); + for (const auto &Function : M) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Function, *Mang); for (const auto &Global : M.globals()) - if (Global.hasDLLExportStorageClass() && !Global.isDeclaration()) - DLLExportedGlobals.push_back(getSymbol(&Global)); - - for (const auto &Alias : M.aliases()) { - if (!Alias.hasDLLExportStorageClass()) - continue; - - if (Alias.getType()->getElementType()->isFunctionTy()) - DLLExportedFns.push_back(getSymbol(&Alias)); - else - DLLExportedGlobals.push_back(getSymbol(&Alias)); - } + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Global, *Mang); + for (const auto &Alias : M.aliases()) + TLOFCOFF.emitLinkerFlagsForGlobal(FlagsOS, &Alias, *Mang); - // Output linker support code for dllexported globals on windows. 
- if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) { - const TargetLoweringObjectFileCOFF &TLOFCOFF = - static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering()); + FlagsOS.flush(); + // Output collected flags. + if (!Flags.empty()) { OutStreamer->SwitchSection(TLOFCOFF.getDrectveSection()); - - for (auto & Symbol : DLLExportedGlobals) - GenerateExportDirective(Symbol, /*IsData=*/true); - for (auto & Symbol : DLLExportedFns) - GenerateExportDirective(Symbol, /*IsData=*/false); + OutStreamer->EmitBytes(Flags); } + + SM.serializeToStackMapSection(); } if (TT.isOSBinFormatELF()) { diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index acba21169c9c..7f5d127c68d5 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -30,8 +30,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { StackMaps SM; FaultMaps FM; - void GenerateExportDirective(const MCSymbol *Sym, bool IsData); - // This utility class tracks the length of a stackmap instruction's 'shadow'. // It is used by the X86AsmPrinter to ensure that the stackmap shadow // invariants (i.e. no other stackmaps, patchpoints, or control flow within diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 6d6831b18b0a..031ba4ba9e66 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -78,7 +78,7 @@ private: typedef DenseMap<MachineInstr *, CallContext> ContextMap; bool isLegal(MachineFunction &MF); - + bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, @@ -90,6 +90,13 @@ private: MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, unsigned Reg); + enum InstClassification { Convert, Skip, Exit }; + + InstClassification classifyInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const X86RegisterInfo &RegInfo, + DenseSet<unsigned int> &UsedRegs); + const char *getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; @@ -99,13 +106,13 @@ private: }; char X86CallFrameOptimization::ID = 0; -} // namespace +} FunctionPass *llvm::createX86CallFrameOptimization() { return new X86CallFrameOptimization(); } -// This checks whether the transformation is legal. +// This checks whether the transformation is legal. // Also returns false in cases where it's potentially legal, but // we don't even want to try. 
bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { @@ -170,9 +177,8 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, if (!OptForSize) return false; - unsigned StackAlign = TFL->getStackAlignment(); - + int64_t Advantage = 0; for (auto CC : CallSeqMap) { // Call sites where no parameters are passed on the stack @@ -205,7 +211,6 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, return (Advantage >= 0); } - bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); TFL = MF.getSubtarget().getFrameLowering(); @@ -237,6 +242,64 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { return Changed; } +X86CallFrameOptimization::InstClassification +X86CallFrameOptimization::classifyInstruction( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) { + if (MI == MBB.end()) + return Exit; + + // The instructions we actually care about are movs onto the stack + int Opcode = MI->getOpcode(); + if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr) + return Convert; + + // Not all calling conventions have only stack MOVs between the stack + // adjust and the call. + + // We want to tolerate other instructions, to cover more cases. + // In particular: + // a) PCrel calls, where we expect an additional COPY of the basereg. + // b) Passing frame-index addresses. + // c) Calling conventions that have inreg parameters. These generate + // both copies and movs into registers. + // To avoid creating lots of special cases, allow any instruction + // that does not write into memory, does not def or use the stack + // pointer, and does not def any register that was used by a preceding + // push. + // (Reading from memory is allowed, even if referenced through a + // frame index, since these will get adjusted properly in PEI) + + // The reason for the last condition is that the pushes can't replace + // the movs in place, because the order must be reversed. + // So if we have a MOV32mr that uses EDX, then an instruction that defs + // EDX, and then the call, after the transformation the push will use + // the modified version of EDX, and not the original one. + // Since we are still in SSA form at this point, we only need to + // make sure we don't clobber any *physical* registers that were + // used by an earlier mov that will become a push. + + if (MI->isCall() || MI->mayStore()) + return Exit; + + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + unsigned int Reg = MO.getReg(); + if (!RegInfo.isPhysicalRegister(Reg)) + continue; + if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister())) + return Exit; + if (MO.isDef()) { + for (unsigned int U : UsedRegs) + if (RegInfo.regsOverlap(Reg, U)) + return Exit; + } + } + + return Skip; +} + void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -254,8 +317,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // How much do we adjust the stack? This puts an upper bound on // the number of parameters actually passed on it. 
- unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; - + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + // A zero adjustment means no stack parameters if (!MaxAdjust) { Context.NoStackParams = true; @@ -284,11 +347,17 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (MaxAdjust > 4) Context.MovVector.resize(MaxAdjust, nullptr); - do { - int Opcode = I->getOpcode(); - if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) - break; + InstClassification Classification; + DenseSet<unsigned int> UsedRegs; + while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) != + Exit) { + if (Classification == Skip) { + ++I; + continue; + } + + // We know the instruction is a MOV32mi/MOV32mr. // We only want movs of the form: // movl imm/r32, k(%esp) // If we run into something else, bail. @@ -323,24 +392,20 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, return; Context.MovVector[StackDisp] = I; - ++I; - } while (I != MBB.end()); - - // We now expect the end of the sequence - a call and a stack adjust. - if (I == MBB.end()) - return; + for (const MachineOperand &MO : I->uses()) { + if (!MO.isReg()) + continue; + unsigned int Reg = MO.getReg(); + if (RegInfo.isPhysicalRegister(Reg)) + UsedRegs.insert(Reg); + } - // For PCrel calls, we expect an additional COPY of the basereg. - // If we find one, skip it. - if (I->isCopy()) { - if (I->getOperand(1).getReg() == - MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg()) - ++I; - else - return; + ++I; } - if (!I->isCall()) + // We now expect the end of the sequence. If we stopped early, + // or reached the end of the block without finding a call, bail. + if (I == MBB.end() || !I->isCall()) return; Context.Call = I; diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h index a377eb6051ae..0eb2494f1d63 100644 --- a/lib/Target/X86/X86CallingConv.h +++ b/lib/Target/X86/X86CallingConv.h @@ -42,7 +42,7 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } -} // namespace llvm +} // End llvm namespace #endif diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 3dc75d76cee3..02645460b6a2 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; @@ -2821,7 +2822,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { bool &IsTailCall = CLI.IsTailCall; bool IsVarArg = CLI.IsVarArg; const Value *Callee = CLI.Callee; - const char *SymName = CLI.SymName; + MCSymbol *Symbol = CLI.Symbol; bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CC); @@ -3117,8 +3118,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { } MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); - if (SymName) - MIB.addExternalSymbol(SymName, OpFlags); + if (Symbol) + MIB.addSym(Symbol, OpFlags); else MIB.addGlobalAddress(GV, 0, OpFlags); } diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 8305a0454c80..5eb4faeedff4 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -91,7 +91,7 @@ private: const X86InstrInfo *TII; // Machine instruction info. 
}; char FixupLEAPass::ID = 0; -} // namespace +} MachineInstr * FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 6f1d8e523732..40b9c8a863a3 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -279,7 +279,7 @@ namespace { void setKillFlags(MachineBasicBlock &MBB) const; }; char FPS::ID = 0; -} // namespace +} FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); } @@ -544,7 +544,7 @@ namespace { return V < TE.from; } }; -} // namespace +} #ifndef NDEBUG static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { @@ -1530,7 +1530,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { if (Op.isKill()) moveToTop(FPReg, Inst); else - duplicateToTop(FPReg, FPReg, Inst); + duplicateToTop(FPReg, ScratchFPReg, Inst); // Emit the call. This will pop the operand. BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 2858e86cd0e0..c274c8820149 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -153,6 +153,6 @@ private: bool InEpilogue) const; }; -} // namespace llvm +} // End llvm namespace #endif diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index f6785e161188..6b23e62a2d35 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -67,19 +67,19 @@ namespace { const Constant *CP; const BlockAddress *BlockAddr; const char *ES; + MCSymbol *MCSym; int JT; unsigned Align; // CP alignment. unsigned char SymbolFlags; // X86II::MO_* X86ISelAddressMode() - : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), - Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), - JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) { - } + : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), + Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), + MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {} bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || - JT != -1 || BlockAddr != nullptr; + MCSym != nullptr || JT != -1 || BlockAddr != nullptr; } bool hasBaseOrIndexReg() const { @@ -134,11 +134,16 @@ namespace { dbgs() << ES; else dbgs() << "nul"; + dbgs() << " MCSym "; + if (MCSym) + dbgs() << MCSym; + else + dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Align << '\n'; } #endif }; -} // namespace +} namespace { //===--------------------------------------------------------------------===// @@ -258,6 +263,10 @@ namespace { else if (AM.ES) { assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); + } else if (AM.MCSym) { + assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); + assert(AM.SymbolFlags == 0 && "oo"); + Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); } else if (AM.JT != -1) { assert(!AM.Disp && "Non-zero displacement is ignored with JT."); Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); @@ -310,7 +319,7 @@ namespace { return true; } }; -} // namespace +} bool @@ -604,7 +613,7 @@ static bool isDispSafeForFrameIndex(int64_t Val) { bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // Cannot combine ExternalSymbol displacements 
with integer offsets. - if (Offset != 0 && AM.ES) + if (Offset != 0 && (AM.ES || AM.MCSym)) return true; int64_t Val = AM.Disp + Offset; CodeModel::Model M = TM.getCodeModel(); @@ -690,6 +699,8 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { AM.ES = S->getSymbol(); AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); @@ -728,6 +739,8 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { AM.ES = S->getSymbol(); AM.SymbolFlags = S->getTargetFlags(); + } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) { + AM.MCSym = S->getMCSymbol(); } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); @@ -1001,7 +1014,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // FIXME: JumpTable and ExternalSymbol address currently don't like // displacements. It isn't very important, but this should be fixed for // consistency. - if (!AM.ES && AM.JT != -1) return true; + if (!(AM.ES || AM.MCSym) && AM.JT != -1) + return true; if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) @@ -1013,13 +1027,11 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, default: break; case ISD::FRAME_ALLOC_RECOVER: { if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) - if (const auto *ESNode = dyn_cast<ExternalSymbolSDNode>(N.getOperand(0))) - if (ESNode->getOpcode() == ISD::TargetExternalSymbol) { - // Use the symbol and don't prefix it. - AM.ES = ESNode->getSymbol(); - AM.SymbolFlags = X86II::MO_NOPREFIX; - return false; - } + if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) { + // Use the symbol and don't prefix it. + AM.MCSym = ESNode->getMCSymbol(); + return false; + } break; } case ISD::Constant: { @@ -1473,6 +1485,7 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { N->getOpcode() != ISD::TargetJumpTable && N->getOpcode() != ISD::TargetGlobalAddress && N->getOpcode() != ISD::TargetExternalSymbol && + N->getOpcode() != ISD::MCSymbol && N->getOpcode() != ISD::TargetBlockAddress) return false; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index ce1ca20ee81a..b16bd18aefaa 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1111,7 +1111,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { + if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -6259,42 +6259,6 @@ is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, return true; } -/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane. -/// -/// This checks a shuffle mask to see if it is performing the same -/// 256-bit lane-relative shuffle in each 256-bit lane. 
This trivially implies -/// that it is also not lane-crossing. It may however involve a blend from the -/// same lane of a second vector. -/// -/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is -/// non-trivial to compute in the face of undef lanes. The representation is -/// *not* suitable for use with existing 256-bit shuffles as it will contain -/// entries from both V1 and V2 inputs to the wider mask. -static bool -is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask, - SmallVectorImpl<int> &RepeatedMask) { - int LaneSize = 256 / VT.getScalarSizeInBits(); - RepeatedMask.resize(LaneSize, -1); - int Size = Mask.size(); - for (int i = 0; i < Size; ++i) { - if (Mask[i] < 0) - continue; - if ((Mask[i] % Size) / LaneSize != i / LaneSize) - // This entry crosses lanes, so there is no way to model this shuffle. - return false; - - // Ok, handle the in-lane shuffles by detecting if and when they repeat. - if (RepeatedMask[i % LaneSize] == -1) - // This is the first non-undef entry in this slot of a 256-bit lane. - RepeatedMask[i % LaneSize] = - Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; - else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) - // Found a mismatch with the repeated mask. - return false; - } - return true; -} - /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// @@ -6354,22 +6318,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } -/// \brief Get a 8-bit shuffle, 1 bit per lane, immediate for a mask. -/// -/// This helper function produces an 8-bit shuffle immediate corresponding to -/// the ubiquitous shuffle encoding scheme used in x86 instructions for -/// shuffling 8 lanes. -static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, - SelectionDAG &DAG) { - assert(Mask.size() <= 8 && - "Up to 8 elts may be in Imm8 1-bit lane shuffle mask"); - unsigned Imm = 0; - for (unsigned i = 0; i < Mask.size(); ++i) - if (Mask[i] >= 0) - Imm |= (Mask[i] % 2) << i; - return DAG.getConstant(Imm, DL, MVT::i8); -} - /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are @@ -9385,30 +9333,6 @@ static SDValue lowerV2X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, DAG.getConstant(PermMask, DL, MVT::i8)); } -/// \brief Handle lowering 4-lane 128-bit shuffles. -static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> WidenedMask, - SelectionDAG &DAG) { - - assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!"); - // form a 128-bit permutation. - // convert the 64-bit shuffle mask selection values into 128-bit selection - // bits defined by a vshuf64x2 instruction's immediate control byte. - unsigned PermMask = 0, Imm = 0; - - for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { - if(WidenedMask[i] == SM_SentinelZero) - return SDValue(); - - // use first element in place of undef musk - Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; - PermMask |= (Imm % 4) << (i * 2); - } - - return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, - DAG.getConstant(PermMask, DL, MVT::i8)); -} - /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. 
/// @@ -10144,105 +10068,86 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } -static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { - - assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN"); - // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right) - int AlignVal = -1; - for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) { - if (Mask[i] < 0) - continue; - if (Mask[i] < i) - return SDValue(); - if (AlignVal == -1) - AlignVal = Mask[i] - i; - else if (Mask[i] - i != AlignVal) - return SDValue(); - } - // Vector source operands should be swapped - return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1, - DAG.getConstant(AlignVal, DL, MVT::i8)); -} +/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. +static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); -static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, - ArrayRef<int> Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { + // X86 has dedicated unpack instructions that can handle specific blend + // operations: UNPCKH and UNPCKL. + if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); - assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); +} - MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); - MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); +/// \brief Handle lowering of 16-lane 32-bit floating point shuffles. +static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - SmallVector<SDValue, 32> VPermMask; - for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) - VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) : - DAG.getConstant(Mask[i], DL,MaskEltVT)); - SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT, - VPermMask); - if (isSingleInputShuffleMask(Mask)) - return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + // Use dedicated unpack instructions for masks that match their pattern. + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 0, 16, 1, 17, 4, 20, 5, 21, + // Second 128-bit lane. + 8, 24, 9, 25, 12, 28, 13, 29})) + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, + {// First 128-bit lane. + 2, 18, 3, 19, 6, 22, 7, 23, + // Second 128-bit lane. 
+ 10, 26, 11, 27, 14, 30, 15, 31})) + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); - return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2); + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); } - -/// \brief Handle lowering of 8-lane 64-bit floating point shuffles. -static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, +/// \brief Handle lowering of 8-lane 64-bit integer shuffles. +static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); - assert((V1.getSimpleValueType() == MVT::v8f64 || - V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!"); - assert((V2.getSimpleValueType() == MVT::v8f64 || - V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!"); + assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - SmallVector<int, 4> WidenedMask; - if (canWidenShuffleElements(Mask, WidenedMask)) - if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG)) - return Op; // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); - - if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG)) - return Op; - - if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG)) - return Op; - - // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7 - if (isSingleInputShuffleMask(Mask)) { - if (!is128BitLaneCrossingShuffleMask(VT, Mask)) - return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1, - get1bitLaneShuffleImm8ForMask(Mask, DL, DAG)); + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); - SmallVector<int, 4> RepeatedMask; - if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) - return DAG.getNode(X86ISD::VPERMI, DL, VT, V1, - getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); - } - return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG); + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
-static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, +static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); - assert((V1.getSimpleValueType() == MVT::v16i32 || - V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!"); - assert((V2.getSimpleValueType() == MVT::v16i32 || - V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!"); + assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); @@ -10253,39 +10158,16 @@ static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, 0, 16, 1, 17, 4, 20, 5, 21, // Second 128-bit lane. 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {// First 128-bit lane. 2, 18, 3, 19, 6, 22, 7, 23, // Second 128-bit lane. 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, - 12, 12, 14, 14})) - return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1); - if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, - 13, 13, 15, 15})) - return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1); - - SmallVector<int, 4> RepeatedMask; - if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) { - if (isSingleInputShuffleMask(Mask)) { - unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI; - return DAG.getNode(Opc, DL, VT, V1, - getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); - } - - for (int i = 0; i < 4; ++i) - if (RepeatedMask[i] >= 16) - RepeatedMask[i] -= 12; - return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG); - } - - if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG)) - return Op; - - return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG); + // FIXME: Implement direct support for this type! + return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10345,11 +10227,13 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, // the requisite ISA extensions for that element type are available. 
switch (VT.SimpleTy) { case MVT::v8f64: - case MVT::v8i64: - return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16f32: + return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG); + case MVT::v8i64: + return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i32: - return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG); + return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v32i16: if (Subtarget->hasBWI()) return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG); @@ -10759,11 +10643,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(VecVT.is128BitVector() && "Unexpected vector length"); - if (Subtarget->hasSSE41()) { - SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); - if (Res.getNode()) + if (Subtarget->hasSSE41()) + if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; - } MVT VT = Op.getSimpleValueType(); // TODO: handle v16i8. @@ -12253,11 +12135,9 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (Subtarget->hasFp256()) { - SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); - if (Res.getNode()) + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; - } return SDValue(); } @@ -12272,11 +12152,9 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); - if (Subtarget->hasFp256()) { - SDValue Res = LowerAVXExtend(Op, DAG, Subtarget); - if (Res.getNode()) + if (Subtarget->hasFp256()) + if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; - } assert(!VT.is256BitVector() || !SVT.is128BitVector() || VT.getVectorNumElements() != SVT.getVectorNumElements()); @@ -15117,6 +14995,54 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } +/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// function or when returning to a parent frame after catching an exception, we +/// recover the parent frame pointer by doing arithmetic on the incoming EBP. +/// Here's the math: +/// RegNodeBase = EntryEBP - RegNodeSize +/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// Subtracting RegNodeSize takes us to the offset of the registration node, and +/// subtracting the offset (negative on x86) takes us back to the parent FP. +static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, + SDValue EntryEBP) { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc dl; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT PtrVT = TLI.getPointerTy(); + + // It's possible that the parent function no longer has a personality function + // if the exceptional code was optimized away, in which case we just return + // the incoming EBP. + if (!Fn->hasPersonalityFn()) + return EntryEBP; + + // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See + // WinEHStatePass for the full struct definition. 
+ int RegNodeSize; + switch (classifyEHPersonality(Fn->getPersonalityFn())) { + default: + report_fatal_error("can only recover FP for MSVC EH personality functions"); + case EHPersonality::MSVC_X86SEH: RegNodeSize = 24; break; + case EHPersonality::MSVC_CXX: RegNodeSize = 16; break; + } + + // Get an MCSymbol that will ultimately resolve to the frame offset of the EH + // registration. + MCSymbol *OffsetSym = + MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( + GlobalValue::getRealLinkageName(Fn->getName())); + SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); + SDValue RegNodeFrameOffset = + DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSymVal); + + // RegNodeBase = EntryEBP - RegNodeSize + // ParentFP = RegNodeBase - RegNodeFrameOffset + SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, + DAG.getConstant(RegNodeSize, dl, PtrVT)); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); +} + static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -15206,6 +15132,23 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1,Src2), Mask, PassThru, Subtarget, DAG); } + case INTR_TYPE_2OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue PassThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + // We specify 2 possible modes for intrinsics, with/without rounding modes. + // First, we check if the intrinsic have rounding mode (6 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 6) + Rnd = Op.getOperand(5); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Rnd), + Mask, PassThru, Subtarget, DAG); + } case INTR_TYPE_3OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); @@ -15230,11 +15173,26 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case VPERM_3OP_MASKZ: + case VPERM_3OP_MASK: + case FMA_OP_MASK3: + case FMA_OP_MASKZ: case FMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + EVT VT = Op.getValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else if (IntrData->Type == FMA_OP_MASK3) + PassThru = Src3; + else + PassThru = Src1; + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
@@ -15246,12 +15204,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), - Mask, Src1, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), - Mask, Src1, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); } case CMP_MASK: case CMP_MASK_CC: { @@ -15330,18 +15288,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = Op.getOperand(2); if (isAllOnes(Mask)) // return data as is return Op.getOperand(1); - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - return DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToCompress, - PassThru); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + DataToCompress), + Mask, PassThru, Subtarget, DAG); } case BLEND: { SDValue Mask = Op.getOperand(3); @@ -15532,15 +15482,23 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( GlobalValue::getRealLinkageName(Fn->getName())); - StringRef Name = LSDASym->getName(); - assert(Name.data()[Name.size()] == '\0' && "not null terminated"); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. - SDValue Result = - DAG.getTargetExternalSymbol(Name.data(), VT, X86II::MO_NOPREFIX); + SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } + + case Intrinsic::x86_seh_recoverfp: { + SDValue FnOp = Op.getOperand(1); + SDValue IncomingFPOp = Op.getOperand(2); + GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); + auto *Fn = dyn_cast_or_null<Function>(GSD ? 
GSD->getGlobal() : nullptr); + if (!Fn) + report_fatal_error( + "llvm.x86.seh.recoverfp must take a function as the first argument"); + return recoverFramePointer(DAG, Fn, IncomingFPOp); + } } } @@ -15550,7 +15508,12 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, const X86Subtarget * Subtarget) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); + if (!C) + llvm_unreachable("Invalid scale type"); + unsigned ScaleVal = C->getZExtValue(); + if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) + llvm_unreachable("Valid scale values are 1, 2, 4, 8"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -15558,8 +15521,16 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getBitcast(MaskVT, Mask); + else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -15576,7 +15547,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); + if (!C) + llvm_unreachable("Invalid scale type"); + unsigned ScaleVal = C->getZExtValue(); + if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) + llvm_unreachable("Valid scale values are 1, 2, 4, 8"); + SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -15586,8 +15562,16 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); - else - MaskInReg = DAG.getBitcast(MaskVT, Mask); + else { + EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Mask.getValueType().getSizeInBits()); + + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. 
+ MaskInReg = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); @@ -15725,37 +15709,38 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerEXCEPTIONINFO(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDLoc dl(Op); - SDValue FnOp = Op.getOperand(2); - SDValue FPOp = Op.getOperand(3); + SDValue Chain = Op.getOperand(0); - // Compute the symbol for the parent EH registration. We know it'll get - // emitted later. - auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(FnOp)->getGlobal()); - MCSymbol *ParentFrameSym = - MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( - GlobalValue::getRealLinkageName(Fn->getName())); - StringRef Name = ParentFrameSym->getName(); - assert(Name.data()[Name.size()] == '\0' && "not null terminated"); - - // Create a TargetExternalSymbol for the label to avoid any target lowering - // that would make this PC relative. - MVT PtrVT = Op.getSimpleValueType(); - SDValue OffsetSym = DAG.getTargetExternalSymbol(Name.data(), PtrVT); - SDValue OffsetVal = - DAG.getNode(ISD::FRAME_ALLOC_RECOVER, dl, PtrVT, OffsetSym); - - // Add the offset to the FP. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, FPOp, OffsetVal); - - // Load the second field of the struct, which is 4 bytes in. See - // WinEHStatePass for more info. - Add = DAG.getNode(ISD::ADD, dl, PtrVT, Add, DAG.getConstant(4, dl, PtrVT)); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Add, MachinePointerInfo(), - false, false, false, 0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT VT = TLI.getPointerTy(); + + const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); + unsigned FrameReg = + RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); + unsigned SPReg = RegInfo->getStackRegister(); + + // Get incoming EBP. + SDValue IncomingEBP = + DAG.getCopyFromReg(Chain, dl, FrameReg, VT); + + // Load [EBP-24] into SP. + SDValue SPAddr = + DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, DAG.getConstant(-24, dl, VT)); + SDValue NewSP = + DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, + false, VT.getScalarSizeInBits() / 8); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); + + // FIXME: Restore the base pointer in case of stack realignment! + + // Adjust EBP to point back to the original frame position. 
+ SDValue NewFP = recoverFramePointer(DAG, MF.getFunction(), IncomingEBP); + Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); + return Chain; } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -15764,8 +15749,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == Intrinsic::x86_seh_exceptioninfo) - return LowerEXCEPTIONINFO(Op, Subtarget, DAG); + if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) + return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); return SDValue(); } @@ -15884,16 +15869,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); - - SDValue Compressed = DAG.getNode(IntrData->Opc0, dl, VT, VMask, - DataToCompress, DAG.getUNDEF(VT)); + SDValue Compressed = + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), + Mask, DAG.getUNDEF(VT), Subtarget, DAG); return DAG.getStore(Chain, dl, Compressed, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -15901,7 +15879,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); - SDValue PathThru = Op.getOperand(3); + SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); EVT VT = Op.getValueType(); @@ -15909,21 +15887,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, if (isAllOnes(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); SDValue DataToExpand = DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); SDValue Results[] = { - DAG.getNode(IntrData->Opc0, dl, VT, VMask, DataToExpand, PathThru), - Chain}; + getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToExpand), + Mask, PassThru, Subtarget, DAG), Chain}; return DAG.getMergeValues(Results, dl); } } @@ -18476,6 +18447,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UMIN: return "X86ISD::UMIN"; case X86ISD::SMAX: return "X86ISD::SMAX"; case X86ISD::SMIN: return "X86ISD::SMIN"; + case X86ISD::ABS: return "X86ISD::ABS"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -18618,9 +18590,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; + case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::ADDS: 
return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; - case X86ISD::AVG: return "X86ISD::AVG"; + case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; } @@ -18777,7 +18750,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4())) + if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) return false; VT = VT.getScalarType(); @@ -19962,6 +19935,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, // Replace 213-type (isel default) FMA3 instructions with 231-type for // accumulator loops. Writing back to the accumulator allows the coalescer // to remove extra copies in the loop. +// FIXME: Do this on AVX512. We don't support 231 variants yet (PR23937). MachineBasicBlock * X86TargetLowering::emitFMA3Instr(MachineInstr *MI, MachineBasicBlock *MBB) const { @@ -21302,8 +21276,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); - SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true); - if (LD.getNode()) + if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true)) return LD; if (isTargetShuffle(N->getOpcode())) { @@ -21451,8 +21424,7 @@ static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { /// use 64-bit extracts and shifts. static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI); - if (NewOp.getNode()) + if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); @@ -22895,16 +22867,14 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { - if (N->getOpcode() == ISD::SHL) { - SDValue V = PerformSHLCombine(N, DAG); - if (V.getNode()) return V; - } + if (N->getOpcode() == ISD::SHL) + if (SDValue V = PerformSHLCombine(N, DAG)) + return V; - if (N->getOpcode() != ISD::SRA) { - // Try to fold this logical shift into a zero vector. - SDValue V = performShiftToAllZeros(N, DAG, Subtarget); - if (V.getNode()) return V; - } + // Try to fold this logical shift into a zero vector. 
+ if (N->getOpcode() != ISD::SRA) + if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) + return V; return SDValue(); } @@ -23284,8 +23254,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; SDValue N0 = N->getOperand(0); @@ -23480,11 +23449,9 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (Subtarget->hasCMov()) { - SDValue RV = performIntegerAbsCombine(N, DAG); - if (RV.getNode()) + if (Subtarget->hasCMov()) + if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; - } return SDValue(); } @@ -24266,23 +24233,37 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } - if (VT.isVector()) { - auto ExtendToVec128 = [&DAG](SDLoc DL, SDValue N) { + if (VT.isVector() && Subtarget->hasSSE2()) { + auto ExtendVecSize = [&DAG](SDLoc DL, SDValue N, unsigned Size) { EVT InVT = N.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), - 128 / InVT.getScalarSizeInBits()); - SmallVector<SDValue, 8> Opnds(128 / InVT.getSizeInBits(), + Size / InVT.getScalarSizeInBits()); + SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(), DAG.getUNDEF(InVT)); Opnds[0] = N; return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); }; + // If target-size is less than 128-bits, extend to a type that would extend + // to 128 bits, extend that and extract the original target vector. + if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits()) && + (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && + (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { + unsigned Scale = 128 / VT.getSizeInBits(); + EVT ExVT = + EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); + SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); + SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, ExVT, Ex); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, + DAG.getIntPtrConstant(0, DL)); + } + // If target-size is 128-bits, then convert to ISD::SIGN_EXTEND_VECTOR_INREG // which ensures lowering to X86ISD::VSEXT (pmovsx*). 
if (VT.getSizeInBits() == 128 && (SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16) && (InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8)) { - SDValue ExOp = ExtendToVec128(DL, N0); + SDValue ExOp = ExtendVecSize(DL, N0, 128); return DAG.getSignExtendVectorInReg(ExOp, DL, VT); } @@ -24301,7 +24282,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); - SrcVec = ExtendToVec128(DL, SrcVec); + SrcVec = ExtendVecSize(DL, SrcVec, 128); SrcVec = DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT); Opnds.push_back(SrcVec); } @@ -24312,11 +24293,9 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasFp256()) return SDValue(); - if (VT.isVector() && VT.getSizeInBits() == 256) { - SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); - if (R.getNode()) + if (VT.isVector() && VT.getSizeInBits() == 256) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - } return SDValue(); } @@ -24332,7 +24311,8 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) + (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && + !Subtarget->hasAVX512())) return SDValue(); SDValue A = N->getOperand(0); @@ -24398,11 +24378,10 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(1, dl, VT)); } } - if (VT.is256BitVector()) { - SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget); - if (R.getNode()) + + if (VT.is256BitVector()) + if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; - } // (i8,i32 zext (udivrem (i8 x, i8 y)) -> // (i8,i32 (udivrem_zext_hreg (i8 x, i8 y) @@ -24606,10 +24585,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_B) return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0)); - SDValue Flags; - - Flags = checkBoolTestSetCCCombine(EFLAGS, CC); - if (Flags.getNode()) { + if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); } @@ -24628,10 +24604,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); - SDValue Flags; - - Flags = checkBoolTestSetCCCombine(EFLAGS, CC); - if (Flags.getNode()) { + if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, Flags); @@ -24695,16 +24668,18 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // Now move on to more general possibilities. 
SDValue Op0 = N->getOperand(0); - EVT InVT = Op0->getValueType(0); + EVT VT = N->getValueType(0); + EVT InVT = Op0.getValueType(); + EVT InSVT = InVT.getScalarType(); // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8 || - InVT == MVT::v8i16 || InVT == MVT::v4i16) { + if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) { SDLoc dl(N); - MVT DstVT = MVT::getVectorVT(MVT::i32, InVT.getVectorNumElements()); + EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); - return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have @@ -24714,10 +24689,10 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 - if (N->getValueType(0) == MVT::f16) + if (VT == MVT::f16) return SDValue(); - if (!Ld->isVolatile() && !N->getValueType(0).isVector() && + if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget->is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget->getTargetLowering()->BuildFILD( @@ -25683,75 +25658,40 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. - if (Res.second->hasType(VT)) + // MVT::Other is used to specify clobber names. + if (Res.second->hasType(VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. - // All of the single-register GCC register classes map their values onto - // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we - // really want an 8-bit or 32-bit register, map to the appropriate register - // class and return the appropriate register. 
- if (Res.second == &X86::GR16RegClass) { - if (VT == MVT::i8 || VT == MVT::i1) { - unsigned DestReg = 0; - switch (Res.first) { - default: break; - case X86::AX: DestReg = X86::AL; break; - case X86::DX: DestReg = X86::DL; break; - case X86::CX: DestReg = X86::CL; break; - case X86::BX: DestReg = X86::BL; break; - } - if (DestReg) { - Res.first = DestReg; - Res.second = &X86::GR8RegClass; - } - } else if (VT == MVT::i32 || VT == MVT::f32) { - unsigned DestReg = 0; - switch (Res.first) { - default: break; - case X86::AX: DestReg = X86::EAX; break; - case X86::DX: DestReg = X86::EDX; break; - case X86::CX: DestReg = X86::ECX; break; - case X86::BX: DestReg = X86::EBX; break; - case X86::SI: DestReg = X86::ESI; break; - case X86::DI: DestReg = X86::EDI; break; - case X86::BP: DestReg = X86::EBP; break; - case X86::SP: DestReg = X86::ESP; break; - } - if (DestReg) { - Res.first = DestReg; - Res.second = &X86::GR32RegClass; - } - } else if (VT == MVT::i64 || VT == MVT::f64) { - unsigned DestReg = 0; - switch (Res.first) { - default: break; - case X86::AX: DestReg = X86::RAX; break; - case X86::DX: DestReg = X86::RDX; break; - case X86::CX: DestReg = X86::RCX; break; - case X86::BX: DestReg = X86::RBX; break; - case X86::SI: DestReg = X86::RSI; break; - case X86::DI: DestReg = X86::RDI; break; - case X86::BP: DestReg = X86::RBP; break; - case X86::SP: DestReg = X86::RSP; break; - } - if (DestReg) { - Res.first = DestReg; - Res.second = &X86::GR64RegClass; - } - } else if (VT != MVT::Other) { - // Type mismatch and not a clobber: Return an error; + // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should + // return "eax". This should even work for things like getting 64bit integer + // registers when given an f64 type. + const TargetRegisterClass *Class = Res.second; + if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || + Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { + unsigned Size = VT.getSizeInBits(); + MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 + : Size == 16 ? MVT::i16 + : Size == 32 ? MVT::i32 + : Size == 64 ? MVT::i64 + : MVT::Other; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (DestReg > 0) { + Res.first = DestReg; + Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass + : SimpleTy == MVT::i16 ? &X86::GR16RegClass + : SimpleTy == MVT::i32 ? &X86::GR32RegClass + : &X86::GR64RegClass; + assert(Res.second->contains(Res.first) && "Register in register class"); + } else { + // No register found/type mismatch. Res.first = 0; Res.second = nullptr; } - } else if (Res.second == &X86::FR32RegClass || - Res.second == &X86::FR64RegClass || - Res.second == &X86::VR128RegClass || - Res.second == &X86::VR256RegClass || - Res.second == &X86::FR32XRegClass || - Res.second == &X86::FR64XRegClass || - Res.second == &X86::VR128XRegClass || - Res.second == &X86::VR256XRegClass || - Res.second == &X86::VR512RegClass) { + } else if (Class == &X86::FR32RegClass || Class == &X86::FR64RegClass || + Class == &X86::VR128RegClass || Class == &X86::VR256RegClass || + Class == &X86::FR32XRegClass || Class == &X86::FR64XRegClass || + Class == &X86::VR128XRegClass || Class == &X86::VR256XRegClass || + Class == &X86::VR512RegClass) { // Handle references to XMM physical registers that got mapped into the // wrong class. 
This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can @@ -25767,15 +25707,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, Res.second = &X86::VR256RegClass; else if (X86::VR512RegClass.hasType(VT)) Res.second = &X86::VR512RegClass; - else if (VT != MVT::Other) { + else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } - } else if (VT != MVT::Other) { - // Type mismatch and not a clobber: Return an error; - Res.first = 0; - Res.second = nullptr; } return Res; diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 9c98333776cf..17660891635c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -211,7 +211,8 @@ namespace llvm { // FP vector get exponent FGETEXP_RND, - + // FP Scale + SCALEF, // Integer add/sub with unsigned saturation. ADDUS, SUBUS, @@ -238,6 +239,9 @@ namespace llvm { /// Signed integer max and min. SMAX, SMIN, + // Integer absolute value + ABS, + /// Floating point max and min. FMAX, FMIN, @@ -516,7 +520,7 @@ namespace llvm { // have memop! In fact, starting from ATOMADD64_DAG all opcodes will be // thought as target memory ops! }; - } // namespace X86ISD + } /// Define some predicates that are used for node matching. namespace X86 { @@ -583,7 +587,7 @@ namespace llvm { TO_ZERO = 3, CUR_DIRECTION = 4 }; - } // namespace X86 + } //===--------------------------------------------------------------------===// // X86 Implementation of the TargetLowering interface @@ -638,9 +642,8 @@ namespace llvm { /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; - /// Returns true if the target allows - /// unaligned memory accesses. of the specified type. Returns whether it - /// is "fast" by reference in the second argument. + /// Returns true if the target allows unaligned memory accesses of the + /// specified type. Returns whether it is "fast" in the last argument. 
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *Fast) const override; @@ -1120,6 +1123,6 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } -} // namespace llvm +} #endif // X86ISELLOWERING_H diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index de6a83506b28..b309b8210851 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -274,6 +274,16 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; +multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (X86select _.KRCWM:$mask, RHS, _.RC:$src1)>; multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, @@ -3436,7 +3446,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, - X86VectorVTInfo _, bit IsCommutable> { + X86VectorVTInfo _> { defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, "$rc, $src2, $src1", "$src1, $src2, $rc", @@ -3446,7 +3456,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRn multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, - X86VectorVTInfo _, bit IsCommutable> { + X86VectorVTInfo _> { defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", @@ -3481,16 +3491,16 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, } multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { - defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>, + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>, + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { - defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info, 0>, + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info, 0>, + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } @@ -3513,6 +3523,48 @@ let Predicates = [HasDQI] in { defm VXOR : avx512_fp_binop_p<0x57, "vxor", X86fxor, 1>; } +multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V; + let mayLoad = 1 in { + defm rm: 
AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>, + EVEX_4V, EVEX_B; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} +defm VSCALEF : avx512_fp_scalef_all<0x2C, "vscalef", X86scalef>, T8PD; + //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// @@ -3870,6 +3922,19 @@ defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W; + +multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { + let Predicates = [HasBWI] in + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; + + let Predicates = [HasVLX, HasBWI] in { + defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256; + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128; + } +} + +defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; + //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// @@ -3950,188 +4015,295 @@ let Predicates = [HasAVX512] in { // let Constraints = "$src1 = $dst" in { -// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. 
-multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - SDPatternOperator OpNode = null_frag> { +multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, AVX512FMA3Base; - let mayLoad = 1 in - defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, AVX512FMA3Base; - defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, AVX512FMA3Base, EVEX_B; - } -} // Constraints = "$src1 = $dst" + } +} -let Constraints = "$src1 = $dst" in { -// Omitting the parameter OpNode (= null_frag) disables ISel pattern matching. -multiclass avx512_fma3_round_rrb<bits<8> opc, string OpcodeStr, - X86VectorVTInfo _, - SDPatternOperator OpNode> { - defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), +multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, AVX512FMA3Base, EVEX_B, EVEX_RC; - } +} } // Constraints = "$src1 = $dst" -multiclass avx512_fma3_round_forms<bits<8> opc213, string OpcodeStr, - X86VectorVTInfo VTI, SDPatternOperator OpNode> { - defm v213r : avx512_fma3_round_rrb<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), - VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; +multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } } -multiclass avx512_fma3p_forms<bits<8> opc213, bits<8> opc231, - string OpcodeStr, X86VectorVTInfo VTI, - SDPatternOperator OpNode> { - defm v213r : avx512_fma3p_rm<opc213, !strconcat(OpcodeStr, "213", VTI.Suffix), - VTI, OpNode>, EVEX_CD8<VTI.EltSize, CD8VF>; - defm v231r : avx512_fma3p_rm<opc231, !strconcat(OpcodeStr, "231", VTI.Suffix), - VTI>, EVEX_CD8<VTI.EltSize, CD8VF>; +multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; } -multiclass avx512_fma3p<bits<8> opc213, bits<8> opc231, - 
string OpcodeStr, - SDPatternOperator OpNode, - SDPatternOperator OpNodeRnd> { -let ExeDomain = SSEPackedSingle in { - defm NAME##PSZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v16f32_info, OpNode>, - avx512_fma3_round_forms<opc213, OpcodeStr, - v16f32_info, OpNodeRnd>, EVEX_V512; - defm NAME##PSZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v8f32x_info, OpNode>, EVEX_V256; - defm NAME##PSZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v4f32x_info, OpNode>, EVEX_V128; +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; +defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; + + +let Constraints = "$src1 = $dst" in { +multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src2, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src1))>, AVX512FMA3Base, EVEX_B; } -let ExeDomain = SSEPackedDouble in { - defm NAME##PDZ : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v8f64_info, OpNode>, - avx512_fma3_round_forms<opc213, OpcodeStr, v8f64_info, - OpNodeRnd>, EVEX_V512, VEX_W; - defm NAME##PDZ256 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v4f64x_info, OpNode>, - EVEX_V256, VEX_W; - defm NAME##PDZ128 : avx512_fma3p_forms<opc213, opc231, OpcodeStr, - v2f64x_info, OpNode>, - EVEX_V128, VEX_W; +} + +multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } -defm VFMADD : avx512_fma3p<0xA8, 0xB8, "vfmadd", X86Fmadd, X86FmaddRnd>; -defm VFMSUB : avx512_fma3p<0xAA, 
0xBA, "vfmsub", X86Fmsub, X86FmsubRnd>; -defm VFMADDSUB : avx512_fma3p<0xA6, 0xB6, "vfmaddsub", X86Fmaddsub, X86FmaddsubRnd>; -defm VFMSUBADD : avx512_fma3p<0xA7, 0xB7, "vfmsubadd", X86Fmsubadd, X86FmsubaddRnd>; -defm VFNMADD : avx512_fma3p<0xAC, 0xBC, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB : avx512_fma3p<0xAE, 0xBE, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; +multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; +defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { - let mayLoad = 1 in - def m: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src3, _.MemOp:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"), - [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), - _.RC:$src3)))]>; - def mb: AVX512FMA3<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src3, _.ScalarMemOp:$src2), - !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, - ", $src3, $dst|$dst, $src3, ${src2}", _.BroadcastStr, "}"), - [(set _.RC:$dst, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))), - _.RC:$src3))]>, EVEX_B; +multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.RC:$src2), + OpcodeStr, "$src2, $src3", "$src3, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + AVX512FMA3Base; + + let mayLoad = 1 in { + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.MemOp:$src2), + OpcodeStr, "$src2, $src3", "$src3, $src2", + (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2), _.RC:$src3))>, + AVX512FMA3Base; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.ScalarMemOp:$src2), + OpcodeStr, "${src2}"##_.BroadcastStr##", $src3", + "$src3, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + _.RC:$src3))>, AVX512FMA3Base, EVEX_B; + } } -} // Constraints = "$src1 = $dst" -multiclass avx512_fma3p_m132_f<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src3, _.RC:$src2, AVX512RC:$rc), + OpcodeStr, "$rc, $src2, $src3", "$src3, $src2, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc)))>, + AVX512FMA3Base, EVEX_B, EVEX_RC; +} +} // Constraints = "$src1 = $dst" -let ExeDomain = SSEPackedSingle in { - defm NAME##PSZ : avx512_fma3p_m132<opc, OpcodeStr##ps, - OpNode,v16f32_info>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm 
NAME##PSZ256 : avx512_fma3p_m132<opc, OpcodeStr##ps, - OpNode, v8f32x_info>, EVEX_V256, - EVEX_CD8<32, CD8VF>; - defm NAME##PSZ128 : avx512_fma3p_m132<opc, OpcodeStr##ps, - OpNode, v4f32x_info>, EVEX_V128, - EVEX_CD8<32, CD8VF>; +multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info512>, + avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } -let ExeDomain = SSEPackedDouble in { - defm NAME##PDZ : avx512_fma3p_m132<opc, OpcodeStr##pd, - OpNode, v8f64_info>, EVEX_V512, - VEX_W, EVEX_CD8<32, CD8VF>; - defm NAME##PDZ256 : avx512_fma3p_m132<opc, OpcodeStr##pd, - OpNode, v4f64x_info>, EVEX_V256, - VEX_W, EVEX_CD8<32, CD8VF>; - defm NAME##PDZ128 : avx512_fma3p_m132<opc, OpcodeStr##pd, - OpNode, v2f64x_info>, EVEX_V128, - VEX_W, EVEX_CD8<32, CD8VF>; + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } -defm VFMADD132 : avx512_fma3p_m132_f<0x98, "vfmadd132", X86Fmadd>; -defm VFMSUB132 : avx512_fma3p_m132_f<0x9A, "vfmsub132", X86Fmsub>; -defm VFMADDSUB132 : avx512_fma3p_m132_f<0x96, "vfmaddsub132", X86Fmaddsub>; -defm VFMSUBADD132 : avx512_fma3p_m132_f<0x97, "vfmsubadd132", X86Fmsubadd>; -defm VFNMADD132 : avx512_fma3p_m132_f<0x9C, "vfnmadd132", X86Fnmadd>; -defm VFNMSUB132 : avx512_fma3p_m132_f<0x9E, "vfnmsub132", X86Fnmsub>; +multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + avx512vl_f32_info>; + defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + avx512vl_f64_info>, VEX_W; +} + +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; +defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>; // Scalar FMA let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, - PatFrag mem_frag> { - let isCommutable = 1 in - def r : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; +multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb, + dag RHS_r, dag RHS_m > { + defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", RHS_VEC_r>, AVX512FMA3Base; + let mayLoad = 1 in - def m : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, f128mem:$src3), - !strconcat(OpcodeStr, + defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs 
_.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", RHS_VEC_m>, AVX512FMA3Base; + + defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb>, + AVX512FMA3Base, EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), + !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, - (mem_frag addr:$src3))))]>; + [RHS_r]>; + let mayLoad = 1 in + def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [RHS_m]>; + }// isCodeGenOnly = 1 +} +}// Constraints = "$src1 = $dst" + +multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, X86VectorVTInfo _ , + string SUFF> { + + defm NAME#213#SUFF: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), + (_.VT (OpNode _.RC:$src2, _.RC:$src1, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))))), + (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src1, _.RC:$src3, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + _.FRC:$src3))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + (_.ScalarLdFrag addr:$src3))))>; + + defm NAME#231#SUFF: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), + (_.VT (OpNode _.RC:$src2, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + _.RC:$src1)), + (_.VT ( OpNodeRnd _.RC:$src2, _.RC:$src3, _.RC:$src1, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, + _.FRC:$src1))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, + (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>; + + defm NAME#132#SUFF: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), + (_.VT (OpNode _.RC:$src1, + (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), + _.RC:$src2)), + (_.VT ( OpNodeRnd _.RC:$src1, _.RC:$src3, _.RC:$src2, + (i32 imm:$rc))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, + _.FRC:$src2))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>; +} + +multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd>{ + let Predicates = [HasAVX512] in { + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f32x_info, "SS">, + EVEX_CD8<32, CD8VT1>, VEX_LIG; + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f64x_info, "SD">, + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + } } -} // Constraints = "$src1 = $dst" -defm VFMADDSSZ : avx512_fma3s_rm<0xA9, "vfmadd213ss", X86Fmadd, FR32X, - f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; -defm VFMADDSDZ : avx512_fma3s_rm<0xA9, "vfmadd213sd", X86Fmadd, FR64X, - f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VFMSUBSSZ : avx512_fma3s_rm<0xAB, "vfmsub213ss", X86Fmsub, FR32X, - f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; -defm VFMSUBSDZ : avx512_fma3s_rm<0xAB, "vfmsub213sd", X86Fmsub, FR64X, - f64, f64mem, sdmem, loadf64>, VEX_W, 
EVEX_CD8<64, CD8VT1>; -defm VFNMADDSSZ : avx512_fma3s_rm<0xAD, "vfnmadd213ss", X86Fnmadd, FR32X, - f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; -defm VFNMADDSDZ : avx512_fma3s_rm<0xAD, "vfnmadd213sd", X86Fnmadd, FR64X, - f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VFNMSUBSSZ : avx512_fma3s_rm<0xAF, "vfnmsub213ss", X86Fnmsub, FR32X, - f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>; -defm VFNMSUBSDZ : avx512_fma3s_rm<0xAF, "vfnmsub213sd", X86Fnmsub, FR64X, - f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from sign integer to float/double @@ -5427,10 +5599,11 @@ defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">; multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86MemOperand memop, PatFrag GatherNode> { - let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in + let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", + ExeDomain = _.ExeDomain in def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), - !strconcat(OpcodeStr, + !strconcat(OpcodeStr#_.Suffix, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [(set _.RC:$dst, _.KRCWM:$mask_wb, (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, @@ -5438,67 +5611,104 @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, EVEX_CD8<_.EltSize, CD8VT1>; } -let ExeDomain = SSEPackedDouble in { -defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", v8f64_info, vy64xmem, - mgatherv8i32>, EVEX_V512, VEX_W; -defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", v8f64_info, vz64mem, - mgatherv8i64>, EVEX_V512, VEX_W; +multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, + vy32xmem, mgatherv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512, + vz64mem, mgatherv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vx32xmem, mgatherv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256, + vy64xmem, mgatherv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mgatherv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz32mem, + mgatherv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz64mem, + mgatherv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vy32xmem, mgatherv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vy64xmem, 
mgatherv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mgatherv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64>, EVEX_V128; } - -let ExeDomain = SSEPackedSingle in { -defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", v16f32_info, vz32mem, - mgatherv16i32>, EVEX_V512; -defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", v8f32x_info, vz64mem, - mgatherv8i64>, EVEX_V512; } -defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", v8i64_info, vy64xmem, - mgatherv8i32>, EVEX_V512, VEX_W; -defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", v16i32_info, vz32mem, - mgatherv16i32>, EVEX_V512; -defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", v8i64_info, vz64mem, - mgatherv8i64>, EVEX_V512, VEX_W; -defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", v8i32x_info, vz64mem, - mgatherv8i64>, EVEX_V512; +defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">, + avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">; + +defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">, + avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">; multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86MemOperand memop, PatFrag ScatterNode> { -let mayStore = 1, Constraints = "$mask = $mask_wb" in +let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in def mr : AVX5128I<opc, MRMDestMem, (outs _.KRCWM:$mask_wb), (ins memop:$dst, _.KRCWM:$mask, _.RC:$src), - !strconcat(OpcodeStr, + !strconcat(OpcodeStr#_.Suffix, "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), _.KRCWM:$mask, vectoraddr:$dst))]>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; } -let ExeDomain = SSEPackedDouble in { -defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", v8f64_info, vy64xmem, - mscatterv8i32>, EVEX_V512, VEX_W; -defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", v8f64_info, vz64mem, - mscatterv8i64>, EVEX_V512, VEX_W; +multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, + vy32xmem, mscatterv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512, + vz64mem, mscatterv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vx32xmem, mscatterv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256, + vy64xmem, mscatterv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mscatterv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz32mem, + mscatterv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz64mem, + mscatterv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vy32xmem, mscatterv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, 
OpcodeStr##"q", _.info128, + vy64xmem, mscatterv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx32xmem, mscatterv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64>, EVEX_V128; } - -let ExeDomain = SSEPackedSingle in { -defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", v16f32_info, vz32mem, - mscatterv16i32>, EVEX_V512; -defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", v8f32x_info, vz64mem, - mscatterv8i64>, EVEX_V512; } -defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", v8i64_info, vy64xmem, - mscatterv8i32>, EVEX_V512, VEX_W; -defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", v16i32_info, vz32mem, - mscatterv16i32>, EVEX_V512; +defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">, + avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">; -defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", v8i64_info, vz64mem, - mscatterv8i64>, EVEX_V512, VEX_W; -defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", v8i32x_info, vz64mem, - mscatterv8i64>, EVEX_V512; +defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">, + avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">; // prefetch multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, @@ -5599,77 +5809,6 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1, def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; -multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, ValueType OpVT, - RegisterClass KRC, RegisterClass RC, - X86MemOperand x86memop, X86MemOperand x86scalar_mop, - string BrdcstStr> { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX; - def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), - (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - def rmb : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), - (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", $dst|$dst, ${src}", BrdcstStr, "}"), - []>, EVEX, EVEX_B; - def rmbk : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), - (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", $dst {${mask}}|$dst {${mask}}, ${src}", BrdcstStr, "}"), - []>, EVEX, EVEX_B, EVEX_K; - def rmbkz : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), - (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", $dst {${mask}} {z}|$dst {${mask}} {z}, ${src}", - BrdcstStr, "}"), - []>, EVEX, EVEX_B, EVEX_KZ; - } -} - -defm VPABSDZ 
: avx512_vpabs<0x1E, "vpabsd", v16i32, VK16WM, VR512, - i512mem, i32mem, "{1to16}">, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPABSQZ : avx512_vpabs<0x1F, "vpabsq", v8i64, VK8WM, VR512, - i512mem, i64mem, "{1to8}">, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - -def : Pat<(xor - (bc_v16i32 (v16i1sextv16i32)), - (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), - (VPABSDZrr VR512:$src)>; -def : Pat<(xor - (bc_v8i64 (v8i1sextv8i64)), - (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), - (VPABSQZrr VR512:$src)>; - -def : Pat<(v16i32 (int_x86_avx512_mask_pabs_d_512 (v16i32 VR512:$src), - (v16i32 immAllZerosV), (i16 -1))), - (VPABSDZrr VR512:$src)>; -def : Pat<(v8i64 (int_x86_avx512_mask_pabs_q_512 (v8i64 VR512:$src), - (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), - (VPABSQZrr VR512:$src)>; - multiclass avx512_conflict<bits<8> opc, string OpcodeStr, RegisterClass RC, RegisterClass KRC, X86MemOperand x86memop, @@ -5868,26 +6007,24 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", //===----------------------------------------------------------------------===// // AVX-512 - COMPRESS and EXPAND // + multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { - def rrkz : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst), - (ins _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", - [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, - _.ImmAllZerosV)))]>, EVEX_KZ; - - let Constraints = "$src0 = $dst" in - def rrk : AVX5128I<opc, MRMDestReg, (outs _.RC:$dst), - (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(set _.RC:$dst, (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, - _.RC:$src0)))]>, EVEX_K; + defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; let mayStore = 1 in { + def mr : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.RC:$src), + OpcodeStr # "\t{$src, $dst |$dst, $src}", + []>, EVEX_CD8<_.EltSize, CD8VT1>; + def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(store (_.VT (X86compress _.KRCWM:$mask, _.RC:$src, undef)), + [(store (_.VT (vselect _.KRCWM:$mask, + (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; } @@ -5915,37 +6052,16 @@ defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info // expand multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { - def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", - [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, (_.VT _.RC:$src), - _.ImmAllZerosV)))]>, EVEX_KZ; - - let Constraints = "$src0 = $dst" in - def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, - (_.VT _.RC:$src), _.RC:$src0)))]>, EVEX_K; - - let mayLoad = 1, Constraints = "$src0 = $dst" in - def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src), - OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, - (_.VT (bitconvert - (_.LdFrag 
addr:$src))), - _.RC:$src0)))]>, - EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; let mayLoad = 1 in - def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.KRCWM:$mask, _.MemOp:$src), - OpcodeStr # "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", - [(set _.RC:$dst, (_.VT (X86expand _.KRCWM:$mask, - (_.VT (bitconvert (_.LdFrag addr:$src))), - _.ImmAllZerosV)))]>, - EVEX_KZ, EVEX_CD8<_.EltSize, CD8VT1>; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand (_.VT (bitconvert + (_.LdFrag addr:$src1)))))>, + AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>; } multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, @@ -6175,3 +6291,91 @@ defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>, EVEX_CD8<64, CD8VF>, VEX_W; + +multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr##_.Suffix, + "$src1", "$src1", + (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; + + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr##_.Suffix, + "$src1", "$src1", + (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_unary_rm<opc, OpcodeStr, OpNode, _> { + let mayLoad = 1 in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1), OpcodeStr##_.Suffix, + "${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr, + (_.VT (OpNode (X86VBroadcast + (_.ScalarLdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, Predicate prd> { + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr, OpNode, avx512vl_i64_info, + prd>, VEX_W; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr, OpNode, avx512vl_i32_info, prd>; +} + +multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, Predicate prd> { + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr, OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr, OpNode, 
avx512vl_i8_info, prd>; +} + +multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode> { + defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + HasAVX512>, + avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + HasBWI>; +} + +defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>; + +def : Pat<(xor + (bc_v16i32 (v16i1sextv16i32)), + (bc_v16i32 (add (v16i32 VR512:$src), (v16i1sextv16i32)))), + (VPABSDZrr VR512:$src)>; +def : Pat<(xor + (bc_v8i64 (v8i1sextv8i64)), + (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), + (VPABSQZrr VR512:$src)>; diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index eb4dc48a7a65..2056056d23a5 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -179,6 +179,6 @@ addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI, .addConstantPoolIndex(CPI, 0, OpFlags).addReg(0); } -} // namespace llvm +} // End llvm namespace #endif diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 912a0fb356ed..7f850d6830e1 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -869,6 +869,7 @@ def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>; def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>; def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>; def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; +def : Pat<(i32 (X86Wrapper mcsym:$dst)), (MOV32ri mcsym:$dst)>; def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>; def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)), @@ -879,6 +880,8 @@ def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)), (ADD32ri GR32:$src1, tglobaladdr:$src2)>; def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)), (ADD32ri GR32:$src1, texternalsym:$src2)>; +def : Pat<(add GR32:$src1, (X86Wrapper mcsym:$src2)), + (ADD32ri GR32:$src1, mcsym:$src2)>; def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)), (ADD32ri GR32:$src1, tblockaddress:$src2)>; @@ -886,6 +889,8 @@ def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst), (MOV32mi addr:$dst, tglobaladdr:$src)>; def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst), (MOV32mi addr:$dst, texternalsym:$src)>; +def : Pat<(store (i32 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV32mi addr:$dst, mcsym:$src)>; def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst), (MOV32mi addr:$dst, tblockaddress:$src)>; @@ -900,6 +905,8 @@ def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>; def : Pat<(i64 (X86Wrapper texternalsym:$dst)), (MOV64ri texternalsym:$dst)>, Requires<[FarData]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri mcsym:$dst)>, Requires<[FarData]>; def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>; @@ -914,6 +921,8 @@ def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; def : Pat<(i64 (X86Wrapper texternalsym:$dst)), (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>; def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; @@ -932,12 +941,15 @@ def : Pat<(store (i64 
(X86Wrapper tglobaladdr:$src)), addr:$dst), def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), (MOV64mi32 addr:$dst, texternalsym:$src)>, Requires<[NearData, IsStatic]>; +def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, mcsym:$src)>, + Requires<[NearData, IsStatic]>; def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), (MOV64mi32 addr:$dst, tblockaddress:$src)>, Requires<[NearData, IsStatic]>; -def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>; -def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>; +def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>; // Calls diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 0dd05d8befd6..49068e9c37d3 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -633,16 +633,16 @@ def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; -def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave\t$dst", [], IIC_FXSAVE>, TB; -def FXSAVE64 : RI<0xAE, MRM0m, (outs opaque512mem:$dst), (ins), - "fxsave64\t$dst", [], IIC_FXSAVE>, TB, - Requires<[In64BitMode]>; +def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; +def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [], IIC_FXRSTOR>, TB; + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [], IIC_FXRSTOR>, TB, - Requires<[In64BitMode]>; + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 16ae77dd81a3..fe245c3a7e38 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -251,6 +251,7 @@ def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -310,6 +311,7 @@ def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; @@ -347,12 +349,10 @@ def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, def X86pcmpistri : SDNode<"X86ISD::PCMPISTRI", SDT_PCMPISTRI>; def X86pcmpestri : 
SDNode<"X86ISD::PCMPESTRI", SDT_PCMPESTRI>; -def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; -def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 3>, - SDTCisVec<3>, SDTCisVec<1>, SDTCisInt<1>]>, []>; +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; @@ -561,6 +561,14 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), return false; }]>; +def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i32 || + Mgt->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -569,6 +577,20 @@ def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), return false; }]>; +def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v2i64 || + Mgt->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; +def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_gather node:$src1, node:$src2, node:$src3) , [{ + if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) + return (Mgt->getIndex().getValueType() == MVT::v4i64 || + Mgt->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -584,6 +606,30 @@ def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), return false; }]>; +def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v2i64 || + Sc->getBasePtr().getValueType() == MVT::v2i64); + return false; +}]>; + +def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v4i32 || + Sc->getBasePtr().getValueType() == MVT::v4i32); + return false; +}]>; + +def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_scatter node:$src1, node:$src2, node:$src3) , [{ + if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) + return (Sc->getIndex().getValueType() == MVT::v4i64 || + Sc->getBasePtr().getValueType() == MVT::v4i64); + return false; +}]>; + def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_scatter node:$src1, node:$src2, node:$src3) , [{ if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) diff --git 
a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 4aa0ae6f1959..b92ba99fb100 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1577,38 +1577,38 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPXORYrr, X86::VPXORYrm, 0 }, // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, - { X86::VFMADDPS4rr, X86::VFMADDPS4mr, 0 }, - { X86::VFMADDPD4rr, X86::VFMADDPD4mr, 0 }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, 0 }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, 0 }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, 0 }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, 0 }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, 0 }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, 0 }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, 0 }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, 0 }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, 0 }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, 0 }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, 0 }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, 0 }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, 0 }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, 0 }, - { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, 0 }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, 0 }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, 0 }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, 0 }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, 0 }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, 0 }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, 0 }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, 0 }, + { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, + { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rr, 
X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE }, // XOP foldable instructions { X86::VPCMOVrr, X86::VPCMOVmr, 0 }, @@ -1852,38 +1852,38 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 }, - { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 }, - { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 }, - { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 }, - { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 }, - { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 }, - { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 }, - { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 }, - { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 }, - { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 }, - { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 }, - { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 }, - { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 }, - { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 }, - { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, - { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, - { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, + { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE }, + { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE }, + { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE }, + { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFNMSUBPS4rrY, 
X86::VFNMSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE }, + { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_NONE }, + { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE }, + { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_NONE }, + { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE }, // XOP foldable instructions { X86::VPCMOVrr, X86::VPCMOVrm, 0 }, @@ -5295,21 +5295,57 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Size, Alignment, /*AllowCommute=*/true); } -static bool isPartialRegisterLoad(const MachineInstr &LoadMI, - const MachineFunction &MF) { +/// Check if \p LoadMI is a partial register load that we can't fold into \p MI +/// because the latter uses contents that wouldn't be defined in the folded +/// version. For instance, this transformation isn't legal: +/// movss (%rdi), %xmm0 +/// addps %xmm0, %xmm0 +/// -> +/// addps (%rdi), %xmm0 +/// +/// But this one is: +/// movss (%rdi), %xmm0 +/// addss %xmm0, %xmm0 +/// -> +/// addss (%rdi), %xmm0 +/// +static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, + const MachineInstr &UserMI, + const MachineFunction &MF) { unsigned Opc = LoadMI.getOpcode(); + unsigned UserOpc = UserMI.getOpcode(); unsigned RegSize = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); - if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) { // These instructions only load 32 bits, we can't fold them if the - // destination register is wider than 32 bits (4 bytes). - return true; + // destination register is wider than 32 bits (4 bytes), and its user + // instruction isn't scalar (SS). + switch (UserOpc) { + case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: + case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: + case X86::MULSSrr_Int: case X86::VMULSSrr_Int: + case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + return false; + default: + return true; + } + } - if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) { // These instructions only load 64 bits, we can't fold them if the - // destination register is wider than 64 bits (8 bytes). - return true; + // destination register is wider than 64 bits (8 bytes), and its user + // instruction isn't scalar (SD). + switch (UserOpc) { + case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: + case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: + case X86::MULSDrr_Int: case X86::VMULSDrr_Int: + case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + return false; + default: + return true; + } + } return false; } @@ -5321,7 +5357,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( unsigned NumOps = LoadMI->getDesc().getNumOperands(); int FrameIndex; if (isLoadFromStackSlot(LoadMI, FrameIndex)) { - if (isPartialRegisterLoad(*LoadMI, MF)) + if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) return nullptr; return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex); } @@ -5434,7 +5470,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( break; } default: { - if (isPartialRegisterLoad(*LoadMI, MF)) + if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) return nullptr; // Folding a normal load. 
Just copy the load's address operands. @@ -6334,22 +6370,11 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel, return isHighLatencyDef(DefMI->getOpcode()); } -/// If the input instruction is part of a chain of dependent ops that are -/// suitable for reassociation, return the earlier instruction in the sequence -/// that defines its first operand, otherwise return a nullptr. -/// If the instruction's operands must be commuted to be considered a -/// reassociation candidate, Commuted will be set to true. -static MachineInstr *isReassocCandidate(const MachineInstr &Inst, - unsigned AssocOpcode, - bool checkPrevOneUse, - bool &Commuted) { - if (Inst.getOpcode() != AssocOpcode) - return nullptr; - - MachineOperand Op1 = Inst.getOperand(1); - MachineOperand Op2 = Inst.getOperand(2); - - const MachineBasicBlock *MBB = Inst.getParent(); +static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst, + const MachineBasicBlock *MBB) { + assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators"); + const MachineOperand &Op1 = Inst.getOperand(1); + const MachineOperand &Op2 = Inst.getOperand(2); const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); // We need virtual register definitions. @@ -6359,80 +6384,99 @@ static MachineInstr *isReassocCandidate(const MachineInstr &Inst, MI1 = MRI.getUniqueVRegDef(Op1.getReg()); if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) MI2 = MRI.getUniqueVRegDef(Op2.getReg()); - + // And they need to be in the trace (otherwise, they won't have a depth). - if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB) - return nullptr; - - Commuted = false; - if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) { + if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB) + return true; + + return false; +} + +static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { + const MachineBasicBlock *MBB = Inst.getParent(); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); + MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); + unsigned AssocOpcode = Inst.getOpcode(); + + // If only one operand has the same opcode and it's the second source operand, + // the operands must be commuted. + Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; + if (Commuted) std::swap(MI1, MI2); - Commuted = true; - } - // Avoid reassociating operands when it won't provide any benefit. If both - // operands are produced by instructions of this type, we may already - // have the optimal sequence. - if (MI2->getOpcode() == AssocOpcode) - return nullptr; - - // The instruction must only be used by the other instruction that we - // reassociate with. - if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) - return nullptr; - - // We must match a simple chain of dependent ops. - // TODO: This check is not necessary for the earliest instruction in the - // sequence. Instead of a sequence of 3 dependent instructions with the same - // opcode, we only need to find a sequence of 2 dependent instructions with - // the same opcode plus 1 other instruction that adds to the height of the - // trace. - if (MI1->getOpcode() != AssocOpcode) - return nullptr; + // 1. The previous instruction must be the same type as Inst. + // 2. 
The previous instruction must have virtual register definitions for its + // operands in the same basic block as Inst. + // 3. The previous instruction's result must only be used by Inst. + if (MI1->getOpcode() == AssocOpcode && + hasVirtualRegDefsInBasicBlock(*MI1, MBB) && + MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) + return true; - return MI1; + return false; } -/// Select a pattern based on how the operands of each associative operation -/// need to be commuted. -static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev, - bool CommuteRoot) { - if (CommutePrev) { - if (CommuteRoot) - return MachineCombinerPattern::MC_REASSOC_XA_YB; - return MachineCombinerPattern::MC_REASSOC_XA_BY; - } else { - if (CommuteRoot) - return MachineCombinerPattern::MC_REASSOC_AX_YB; - return MachineCombinerPattern::MC_REASSOC_AX_BY; - } +/// Return true if the input instruction is part of a chain of dependent ops +/// that are suitable for reassociation, otherwise return false. +/// If the instruction's operands must be commuted to have a previous +/// instruction of the same type define the first source operand, Commuted will +/// be set to true. +static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode, + bool &Commuted) { + // 1. The instruction must have the correct type. + // 2. The instruction must have virtual register definitions for its + // operands in the same basic block. + // 3. The instruction must have a reassociatable sibling. + if (Inst.getOpcode() == AssocOpcode && + hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && + hasReassocSibling(Inst, Commuted)) + return true; + + return false; } +// FIXME: This has the potential to be expensive (compile time) while not +// improving the code at all. Some ways to limit the overhead: +// 1. Track successful transforms; bail out if hit rate gets too low. +// 2. Only enable at -O3 or some other non-default optimization level. +// 3. Pre-screen pattern candidates here: if an operand of the previous +// instruction is known to not increase the critical path, then don't match +// that pattern. bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) return false; + // TODO: There is nothing x86-specific here except the instruction type. + // This logic could be hoisted into the machine combiner pass itself. + + // Look for this reassociation pattern: + // B = A op X (Prev) + // C = B op Y (Root) + // TODO: There are many more associative instruction types to match: // 1. Other forms of scalar FP add (non-AVX) // 2. Other data types (double, integer, vectors) // 3. Other math / logic operations (mul, and, or) unsigned AssocOpcode = X86::VADDSSrr; - // TODO: There is nothing x86-specific here except the instruction type. - // This logic could be hoisted into the machine combiner pass itself. - bool CommuteRoot; - if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true, - CommuteRoot)) { - bool CommutePrev; - if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) { - // We found a sequence of instructions that may be suitable for a - // reassociation of operands to increase ILP. - Patterns.push_back(getPattern(CommutePrev, CommuteRoot)); - return true; + bool Commute = false; + if (isReassocCandidate(Root, AssocOpcode, Commute)) { + // We found a sequence of instructions that may be suitable for a + // reassociation of operands to increase ILP. 
Specify each commutation + // possibility for the Prev instruction in the sequence and let the + // machine combiner decide if changing the operands is worthwhile. + if (Commute) { + Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB); + Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB); + } else { + Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY); + Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY); } + return true; } - + return false; } @@ -6525,14 +6569,16 @@ void X86InstrInfo::genAlternativeCodeSequence( // Select the previous instruction in the sequence based on the input pattern. MachineInstr *Prev = nullptr; - if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY || - Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY) - Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB || - Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB) - Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); - else - llvm_unreachable("Unknown pattern for machine combiner"); + switch (Pattern) { + case MachineCombinerPattern::MC_REASSOC_AX_BY: + case MachineCombinerPattern::MC_REASSOC_XA_BY: + Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + break; + case MachineCombinerPattern::MC_REASSOC_AX_YB: + case MachineCombinerPattern::MC_REASSOC_XA_YB: + Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); + } + assert(Prev && "Unknown pattern for machine combiner"); reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); return; @@ -6604,7 +6650,7 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} // namespace +} char CGBR::ID = 0; FunctionPass* @@ -6716,7 +6762,7 @@ namespace { MachineFunctionPass::getAnalysisUsage(AU); } }; -} // namespace +} char LDTLSCleanup::ID = 0; FunctionPass* diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index 4912951140d9..bf63336c7005 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -90,7 +90,7 @@ namespace X86 { /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. 
CondCode GetOppositeBranchCondition(CondCode CC); -} // namespace X86 +} // end namespace X86; /// isGlobalStubReference - Return true if the specified TargetFlag operand is @@ -512,6 +512,6 @@ private: int &FrameIndex) const; }; -} // namespace llvm +} // End llvm namespace #endif diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index e936b4bc466e..6f38cb8eaf33 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -282,6 +282,10 @@ let RenderMethod = "addMemOperands" in { def X86MemVX64Operand : AsmOperandClass { let Name = "MemVX64"; } def X86MemVY64Operand : AsmOperandClass { let Name = "MemVY64"; } def X86MemVZ64Operand : AsmOperandClass { let Name = "MemVZ64"; } + def X86MemVX32XOperand : AsmOperandClass { let Name = "MemVX32X"; } + def X86MemVY32XOperand : AsmOperandClass { let Name = "MemVY32X"; } + def X86MemVX64XOperand : AsmOperandClass { let Name = "MemVX64X"; } + def X86MemVY64XOperand : AsmOperandClass { let Name = "MemVY64X"; } } def X86AbsMemAsmOperand : AsmOperandClass { @@ -332,7 +336,11 @@ def vx32mem : X86VMemOperand<VR128, "printi32mem", X86MemVX32Operand>; def vy32mem : X86VMemOperand<VR256, "printi32mem", X86MemVY32Operand>; def vx64mem : X86VMemOperand<VR128, "printi64mem", X86MemVX64Operand>; def vy64mem : X86VMemOperand<VR256, "printi64mem", X86MemVY64Operand>; -def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64Operand>; + +def vx32xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX32XOperand>; +def vx64xmem : X86VMemOperand<VR128X, "printi32mem", X86MemVX64XOperand>; +def vy32xmem : X86VMemOperand<VR256X, "printi32mem", X86MemVY32XOperand>; +def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 95629184f2cf..2a896dfe8aa8 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7860,10 +7860,11 @@ def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, int_x86_avx2_vbroadcast_sd_pd_256, WriteFShuffle256>, VEX_L; -let Predicates = [HasAVX2] in -def VBROADCASTI128 : avx_broadcast_no_int<0x5A, "vbroadcasti128", VR256, - i128mem, v4i64, loadv2i64, - WriteLoad>, VEX_L; +let mayLoad = 1, Predicates = [HasAVX2] in +def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), + (ins i128mem:$src), + "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteLoad]>, VEX, VEX_L; let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 2b829301e327..61a33484b8bf 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -21,8 +21,9 @@ enum IntrinsicType { GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, - INTR_TYPE_3OP_MASK, FMA_OP_MASK, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, + INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, EXPAND_FROM_MEM, BLEND }; @@ -55,6 +56,22 @@ static const IntrinsicData IntrinsicsWithChain[] = { 
X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0), X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0), + X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0), X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0), X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0), @@ -129,15 +146,30 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0), - - X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, - X86::VSCATTERPF0DPDm, X86::VSCATTERPF1DPDm), - X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, - X86::VSCATTERPF0DPSm, X86::VSCATTERPF1DPSm), - X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, - X86::VSCATTERPF0QPDm, X86::VSCATTERPF1QPDm), - X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, - X86::VSCATTERPF0QPSm, X86::VSCATTERPF1QPSm), + X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm, + X86::VSCATTERPF1DPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm, + X86::VSCATTERPF1DPSm), + X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm, + X86::VSCATTERPF1QPDm), + X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm, + X86::VSCATTERPF1QPSm), + X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0), + 
X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0), + X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0), X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0), X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0), @@ -251,6 +283,52 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), + + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, + X86ISD::FMSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD, + X86ISD::FMSUBADD_RND), + + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), 
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_add_pd_128, INTR_TYPE_2OP_MASK, ISD::FADD, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_256, INTR_TYPE_2OP_MASK, ISD::FADD, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, @@ -382,9 +460,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), @@ -393,7 +471,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, - X86ISD::FMAX_RND), + X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, @@ -405,7 +483,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, - X86ISD::FMIN_RND), + X86ISD::FMIN_RND), X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, X86ISD::FMIN_RND), X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, @@ -428,6 +506,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_or_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), X86_INTRINSIC_DATA(avx512_mask_or_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), X86_INTRINSIC_DATA(avx512_mask_or_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FOR, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), + X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0), X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_mask_packssdw_256, 
INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0), @@ -581,6 +671,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_por_q_128, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_256, INTR_TYPE_2OP_MASK, ISD::OR, 0), X86_INTRINSIC_DATA(avx512_mask_por_q_512, INTR_TYPE_2OP_MASK, ISD::OR, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_128, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_256, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(avx512_mask_pshuf_b_512, INTR_TYPE_2OP_MASK, + X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx512_mask_psll_d, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_psll_q, INTR_TYPE_2OP_MASK, X86ISD::VSHL, 0), X86_INTRINSIC_DATA(avx512_mask_pslli_d, VSHIFT_MASK, X86ISD::VSHLI, 0), @@ -633,6 +729,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::RNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, @@ -667,12 +775,181 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), + + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD, + 
X86ISD::FNMADD_RND), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD, + X86ISD::FNMADD_RND), + + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, + X86ISD::FNMSUB_RND), + + + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, + X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 
0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, + X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), + + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86ISD::FMADD_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86ISD::FMADD_RND), + + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, + X86ISD::FMADDSUB_RND), + + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ, + X86ISD::VPERMV3, 0), + 
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
+  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
+                     X86ISD::VPERMV3, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
   X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0),
@@ -696,54 +973,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
   X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
-                     X86ISD::FMADD_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
-                     X86ISD::FMADD_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
-                     X86ISD::FMADDSUB_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
-                     X86ISD::FMADDSUB_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_128, FMA_OP_MASK, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_256, FMA_OP_MASK, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsub_pd_512, FMA_OP_MASK, X86ISD::FMSUB,
-                     X86ISD::FMSUB_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_128, FMA_OP_MASK, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_256, FMA_OP_MASK, X86ISD::FMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsub_ps_512, FMA_OP_MASK, X86ISD::FMSUB,
-                     X86ISD::FMSUB_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_pd_512, FMA_OP_MASK, X86ISD::FMSUBADD,
-                     X86ISD::FMSUBADD_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_128, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_256, FMA_OP_MASK, X86ISD::FMSUBADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfmsubadd_ps_512, FMA_OP_MASK, X86ISD::FMSUBADD,
-                     X86ISD::FMSUBADD_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
-                     X86ISD::FNMADD_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
-                     X86ISD::FNMADD_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
-                     X86ISD::FNMSUB_RND),
-  X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
-  X86_INTRINSIC_DATA(fma_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
-                     X86ISD::FNMSUB_RND),
   X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0),
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 64135e0f53e5..3415cedc6fea 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -112,7 +112,7 @@ namespace llvm {
     OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
     SMShadowTracker.count(Inst, getSubtargetInfo());
   }
-} // namespace llvm
+} // end llvm namespace
 
 X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
                                X86AsmPrinter &asmprinter)
@@ -159,10 +159,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
     const GlobalValue *GV = MO.getGlobal();
     AsmPrinter.getNameWithPrefix(Name, GV);
   } else if (MO.isSymbol()) {
-    if (MO.getTargetFlags() == X86II::MO_NOPREFIX)
-      Name += MO.getSymbolName();
-    else
-      getMang()->getNameWithPrefix(Name, MO.getSymbolName());
+    Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL);
   } else if (MO.isMBB()) {
     assert(Suffix.empty());
     Sym = MO.getMBB()->getSymbol();
@@ -241,7 +238,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case X86II::MO_DARWIN_NONLAZY:
   case X86II::MO_DLLIMPORT:
   case X86II::MO_DARWIN_STUB:
-  case X86II::MO_NOPREFIX:
     break;
 
   case X86II::MO_TLVP:      RefKind = MCSymbolRefExpr::VK_TLVP; break;
@@ -423,6 +419,8 @@ X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
   case MachineOperand::MO_GlobalAddress:
   case MachineOperand::MO_ExternalSymbol:
     return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+  case MachineOperand::MO_MCSymbol:
+    return LowerSymbolOperand(MO, MO.getMCSymbol());
   case MachineOperand::MO_JumpTableIndex:
     return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
   case MachineOperand::MO_ConstantPoolIndex:
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index 342d26ab1fbb..d598b55aae3e 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -179,6 +179,6 @@ public:
   }
 };
 
-} // namespace llvm
+} // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 33aa78ffdf8a..143e70bda9e7 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -84,7 +84,7 @@ namespace {
   };
 
   char PadShortFunc::ID = 0;
-} // namespace
+}
 
 FunctionPass *llvm::createX86PadShortFunctions() {
   return new PadShortFunc();
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 00e213423974..0033b5058187 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -598,10 +598,10 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
 }
 
 namespace llvm {
-unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
-                                bool High) {
+unsigned getX86SubSuperRegisterOrZero(unsigned Reg, MVT::SimpleValueType VT,
+                                      bool High) {
   switch (VT) {
-  default: llvm_unreachable("Unexpected VT");
+  default: return 0;
   case MVT::i8:
     if (High) {
       switch (Reg) {
@@ -625,7 +625,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
       }
     } else {
       switch (Reg) {
-      default: llvm_unreachable("Unexpected register");
+      default: return 0;
       case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
         return X86::AL;
       case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -662,7 +662,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
     }
   case MVT::i16:
     switch (Reg) {
-    default: llvm_unreachable("Unexpected register");
+    default: return 0;
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::AX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -698,7 +698,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
     }
   case MVT::i32:
     switch (Reg) {
-    default: llvm_unreachable("Unexpected register");
+    default: return 0;
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::EAX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -734,7 +734,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
     }
   case MVT::i64:
     switch (Reg) {
-    default: llvm_unreachable("Unexpected register");
+    default: return 0;
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::RAX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -771,6 +771,14 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
   }
 }
 
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
+                                bool High) {
+  unsigned Res = getX86SubSuperRegisterOrZero(Reg, VT, High);
+  if (Res == 0)
+    llvm_unreachable("Unexpected register or VT");
+  return Res;
+}
+
 unsigned get512BitSuperRegister(unsigned Reg) {
   if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
     return X86::ZMM0 + (Reg - X86::XMM0);
@@ -781,4 +789,4 @@ unsigned get512BitSuperRegister(unsigned Reg) {
 
   llvm_unreachable("Unexpected SIMD register");
 }
-} // namespace llvm
+}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 459ecf7fff72..8de1d0bf8ec8 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -128,14 +128,19 @@ public:
   unsigned getSlotSize() const { return SlotSize; }
 };
 
-// getX86SubSuperRegister - X86 utility function. It returns the sub or super
-// register of a specific X86 register.
-// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
+/// Returns the sub or super register of a specific X86 register.
+/// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
+/// Aborts on error.
 unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
 
+/// Returns the sub or super register of a specific X86 register.
+/// Like getX86SubSuperRegister() but returns 0 on error.
+unsigned getX86SubSuperRegisterOrZero(unsigned, MVT::SimpleValueType,
+                                      bool High = false);
+
 //get512BitRegister - X86 utility - returns 512-bit super register
 unsigned get512BitSuperRegister(unsigned Reg);
 
-} // namespace llvm
+} // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index 25606d3f5df3..eb7e0ed9de6c 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -48,6 +48,6 @@ public:
                                   MachinePointerInfo SrcPtrInfo) const override;
 };
 
-} // namespace llvm
+}
 
 #endif
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6934061c6922..d420abbe1433 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -490,6 +490,6 @@ public:
   }
 };
 
-} // namespace llvm
+} // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 3d6eb4f7ce02..fb9cb4ba4c86 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -110,12 +110,15 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
-  // TODO: By default, all reciprocal estimate operations are off because
-  // that matches the behavior before TargetRecip was added (except for btver2
-  // which used subtarget features to enable this type of codegen).
-  // We should change this to match GCC behavior where everything but
-  // scalar division estimates are turned on by default with -ffast-math.
-  this->Options.Reciprocals.setDefaults("all", false, 1);
+  // By default (and when -ffast-math is on), enable estimate codegen for
+  // everything except scalar division. By default, use 1 refinement step for
+  // all operations. Defaults may be overridden by using command-line options.
+  // Scalar division estimates are disabled because they break too much
+  // real-world code. These defaults match GCC behavior.
+  this->Options.Reciprocals.setDefaults("sqrtf", true, 1);
+  this->Options.Reciprocals.setDefaults("divf", false, 1);
+  this->Options.Reciprocals.setDefaults("vec-sqrtf", true, 1);
+  this->Options.Reciprocals.setDefaults("vec-divf", true, 1);
 
   initAsmInfo();
 }
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index be56888b75f4..262955698e44 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -44,6 +44,6 @@ public:
   }
 };
 
-} // namespace llvm
+} // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index f9f62904b64b..6f900ea351ef 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -131,52 +131,44 @@ static std::string APIntToHexString(const APInt &AI) {
   return HexString;
 }
 
-
 static std::string scalarConstantToHexString(const Constant *C) {
   Type *Ty = C->getType();
-  APInt AI;
   if (isa<UndefValue>(C)) {
-    AI = APInt(Ty->getPrimitiveSizeInBits(), /*val=*/0);
-  } else if (Ty->isFloatTy() || Ty->isDoubleTy()) {
-    const auto *CFP = cast<ConstantFP>(C);
-    AI = CFP->getValueAPF().bitcastToAPInt();
-  } else if (Ty->isIntegerTy()) {
-    const auto *CI = cast<ConstantInt>(C);
-    AI = CI->getValue();
+    return APIntToHexString(APInt::getNullValue(Ty->getPrimitiveSizeInBits()));
+  } else if (const auto *CFP = dyn_cast<ConstantFP>(C)) {
+    return APIntToHexString(CFP->getValueAPF().bitcastToAPInt());
+  } else if (const auto *CI = dyn_cast<ConstantInt>(C)) {
+    return APIntToHexString(CI->getValue());
   } else {
-    llvm_unreachable("unexpected constant pool element type!");
+    unsigned NumElements;
+    if (isa<VectorType>(Ty))
+      NumElements = Ty->getVectorNumElements();
+    else
+      NumElements = Ty->getArrayNumElements();
+    std::string HexString;
+    for (int I = NumElements - 1, E = -1; I != E; --I)
+      HexString += scalarConstantToHexString(C->getAggregateElement(I));
+    return HexString;
   }
-  return APIntToHexString(AI);
 }
 
 MCSection *
 X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind,
                                                   const Constant *C) const {
-  if (Kind.isReadOnly()) {
-    if (C) {
-      Type *Ty = C->getType();
-      SmallString<32> COMDATSymName;
-      if (Ty->isFloatTy() || Ty->isDoubleTy()) {
-        COMDATSymName = "__real@";
-        COMDATSymName += scalarConstantToHexString(C);
-      } else if (const auto *VTy = dyn_cast<VectorType>(Ty)) {
-        uint64_t NumBits = VTy->getBitWidth();
-        if (NumBits == 128 || NumBits == 256) {
-          COMDATSymName = NumBits == 128 ? "__xmm@" : "__ymm@";
"__xmm@" : "__ymm@"; - for (int I = VTy->getNumElements() - 1, E = -1; I != E; --I) - COMDATSymName += - scalarConstantToHexString(C->getAggregateElement(I)); - } - } - if (!COMDATSymName.empty()) { - unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | - COFF::IMAGE_SCN_MEM_READ | - COFF::IMAGE_SCN_LNK_COMDAT; - return getContext().getCOFFSection(".rdata", Characteristics, Kind, - COMDATSymName, - COFF::IMAGE_COMDAT_SELECT_ANY); - } - } + if (Kind.isMergeableConst() && C) { + const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ | + COFF::IMAGE_SCN_LNK_COMDAT; + std::string COMDATSymName; + if (Kind.isMergeableConst4() || Kind.isMergeableConst8()) + COMDATSymName = "__real@" + scalarConstantToHexString(C); + else if (Kind.isMergeableConst16()) + COMDATSymName = "__xmm@" + scalarConstantToHexString(C); + + if (!COMDATSymName.empty()) + return getContext().getCOFFSection(".rdata", Characteristics, Kind, + COMDATSymName, + COFF::IMAGE_COMDAT_SELECT_ANY); } return TargetLoweringObjectFile::getSectionForConstant(Kind, C); diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 13384fab5985..0c82a700952b 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1130,3 +1130,18 @@ bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { return isLegalMaskedLoad(DataType, Consecutive); } +bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + // Work this as a subsetting of subtarget features. + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // FIXME: This is likely too limiting as it will include subtarget features + // that we might not care about for inlining, but it is conservatively + // correct. + return (CallerBits & CalleeBits) == CalleeBits; +} diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index e570bb55710a..a83158440193 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -103,6 +103,8 @@ public: Type *Ty); bool isLegalMaskedLoad(Type *DataType, int Consecutive); bool isLegalMaskedStore(Type *DataType, int Consecutive); + bool hasCompatibleFunctionAttributes(const Function *Caller, + const Function *Callee) const; /// @} }; diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 71ce45b0bc2e..6925b272b4a5 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -86,7 +86,7 @@ namespace { }; char VZeroUpperInserter::ID = 0; -} // namespace +} FunctionPass *llvm::createX86IssueVZeroUpperPass() { return new VZeroUpperInserter(); diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index c9e80945549b..90357257b9ef 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -105,7 +105,7 @@ private: /// The linked list node subobject inside of RegNode. 
   Value *Link = nullptr;
 };
-} // namespace
+}
 
 FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); }
 
@@ -398,6 +398,7 @@ void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) {
 
   // Set up RegNodeEscapeIndex
   int RegNodeEscapeIndex = escapeRegNode(F);
+  FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex;
 
   // Only insert stores in catch handlers.
   Constant *FI8 =
@@ -480,8 +481,8 @@ void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) {
   WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F);
 
   // Remember and return the index that we used. We save it in WinEHFuncInfo so
-  // that we can lower llvm.x86.seh.exceptioninfo later in filter functions
-  // without too much trouble.
+  // that we can lower llvm.x86.seh.recoverfp later in filter functions without
+  // too much trouble.
   int RegNodeEscapeIndex = escapeRegNode(F);
   FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex;
 
@@ -528,14 +529,12 @@ void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) {
     }
   }
 
-  // Insert llvm.stackrestore into each __except block.
-  Function *StackRestore =
-      Intrinsic::getDeclaration(TheModule, Intrinsic::stackrestore);
+  // Insert llvm.x86.seh.restoreframe() into each __except block.
+  Function *RestoreFrame =
+      Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_restoreframe);
   for (BasicBlock *ExceptBB : ExceptBlocks) {
     IRBuilder<> Builder(ExceptBB->begin());
-    Value *SP =
-        Builder.CreateLoad(Builder.CreateStructGEP(RegNodeTy, RegNode, 0));
-    Builder.CreateCall(StackRestore, {SP});
+    Builder.CreateCall(RestoreFrame, {});
   }
 }