Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/X86')
125 files changed, 17919 insertions, 11135 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index d37d812df485..a3014b2aba92 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -31,6 +31,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -39,6 +40,11 @@ using namespace llvm; +static cl::opt<bool> LVIInlineAsmHardening( + "x86-experimental-lvi-inline-asm-hardening", + cl::desc("Harden inline assembly code that may be vulnerable to Load Value" + " Injection (LVI). This feature is experimental."), cl::Hidden); + static bool checkScale(unsigned Scale, StringRef &ErrMsg) { if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; @@ -74,7 +80,7 @@ class X86AsmParser : public MCTargetAsmParser { enum VEXEncoding { VEXEncoding_Default, - VEXEncoding_VEX2, + VEXEncoding_VEX, VEXEncoding_VEX3, VEXEncoding_EVEX, }; @@ -326,6 +332,7 @@ private: IES_PLUS, IES_MINUS, IES_OFFSET, + IES_CAST, IES_NOT, IES_MULTIPLY, IES_DIVIDE, @@ -352,6 +359,7 @@ private: bool MemExpr; bool OffsetOperator; SMLoc OffsetOperatorLoc; + StringRef CurType; bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) { if (Sym) { @@ -379,6 +387,7 @@ private: unsigned getScale() { return Scale; } const MCExpr *getSym() { return Sym; } StringRef getSymName() { return SymName; } + StringRef getType() { return CurType; } int64_t getImm() { return Imm + IC.execute(); } bool isValidEndState() { return State == IES_RBRAC || State == IES_INTEGER; @@ -611,9 +620,9 @@ private: } bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName, const InlineAsmIdentifierInfo &IDInfo, - bool ParsingInlineAsm, StringRef &ErrMsg) { + bool ParsingMSInlineAsm, StringRef &ErrMsg) { // InlineAsm: Treat an enum value as an integer - if (ParsingInlineAsm) + if (ParsingMSInlineAsm) if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) return onInteger(IDInfo.Enum.EnumVal, ErrMsg); // Treat a symbolic constant like an integer @@ -624,6 +633,7 @@ private: default: State = IES_ERROR; break; + case IES_CAST: case IES_PLUS: case IES_MINUS: case IES_NOT: @@ -634,7 +644,7 @@ private: MemExpr = true; State = IES_INTEGER; IC.pushOperand(IC_IMM); - if (ParsingInlineAsm) + if (ParsingMSInlineAsm) Info = IDInfo; break; } @@ -736,6 +746,7 @@ private: IC.pushOperator(IC_PLUS); break; case IES_INIT: + case IES_CAST: assert(!BracCount && "BracCount should be zero on parsing's start"); State = IES_LBRAC; break; @@ -808,6 +819,7 @@ private: case IES_INTEGER: case IES_OFFSET: case IES_REGISTER: + case IES_RBRAC: case IES_RPAREN: State = IES_RPAREN; IC.pushOperator(IC_RPAREN); @@ -815,7 +827,7 @@ private: } } bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID, - const InlineAsmIdentifierInfo &IDInfo, bool ParsingInlineAsm, + const InlineAsmIdentifierInfo &IDInfo, bool ParsingMSInlineAsm, StringRef &ErrMsg) { PrevState = State; switch (State) { @@ -833,13 +845,26 @@ private: // As we cannot yet resolve the actual value (offset), we retain // the requested semantics by pushing a '0' to the operands stack IC.pushOperand(IC_IMM); - if (ParsingInlineAsm) { + if (ParsingMSInlineAsm) { Info = IDInfo; } break; } return false; } + 
void onCast(StringRef Type) { + PrevState = State; + switch (State) { + default: + State = IES_ERROR; + break; + case IES_LPAREN: + setType(Type); + State = IES_CAST; + break; + } + } + void setType(StringRef Type) { CurType = Type; } }; bool Error(SMLoc L, const Twine &Msg, SMRange Range = None, @@ -858,6 +883,11 @@ private: return nullptr; } + bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc, + SMLoc EndLoc); + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc, + bool RestoreOnFailure); + std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc); std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc); bool IsSIReg(unsigned Reg); @@ -896,10 +926,10 @@ private: bool ParseIntelMemoryOperandSize(unsigned &Size); std::unique_ptr<X86Operand> - CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, - unsigned IndexReg, unsigned Scale, SMLoc Start, - SMLoc End, unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info); + CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, + unsigned IndexReg, unsigned Scale, SMLoc Start, + SMLoc End, unsigned Size, StringRef Identifier, + const InlineAsmIdentifierInfo &Info); bool parseDirectiveEven(SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); @@ -927,9 +957,14 @@ private: bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); - /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds + // Load Value Injection (LVI) Mitigations for machine code + void emitWarningForSpecialLVIInstruction(SMLoc Loc); + void applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out); + void applyLVILoadHardeningMitigation(MCInst &Inst, MCStreamer &Out); + + /// Wrapper around MCStreamer::emitInstruction(). Possibly adds /// instrumentation around Inst. - void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); + void emitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out); bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1023,6 +1058,8 @@ public: } bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) override; bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; @@ -1129,36 +1166,21 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, return checkScale(Scale, ErrMsg); } -bool X86AsmParser::ParseRegister(unsigned &RegNo, - SMLoc &StartLoc, SMLoc &EndLoc) { - MCAsmParser &Parser = getParser(); - RegNo = 0; - const AsmToken &PercentTok = Parser.getTok(); - StartLoc = PercentTok.getLoc(); - +bool X86AsmParser::MatchRegisterByName(unsigned &RegNo, StringRef RegName, + SMLoc StartLoc, SMLoc EndLoc) { // If we encounter a %, ignore it. This code handles registers with and // without the prefix, unprefixed registers can occur in cfi directives. - if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) - Parser.Lex(); // Eat percent token. 
+ RegName.consume_front("%"); - const AsmToken &Tok = Parser.getTok(); - EndLoc = Tok.getEndLoc(); - - if (Tok.isNot(AsmToken::Identifier)) { - if (isParsingIntelSyntax()) return true; - return Error(StartLoc, "invalid register name", - SMRange(StartLoc, EndLoc)); - } - - RegNo = MatchRegisterName(Tok.getString()); + RegNo = MatchRegisterName(RegName); // If the match failed, try the register name as lowercase. if (RegNo == 0) - RegNo = MatchRegisterName(Tok.getString().lower()); + RegNo = MatchRegisterName(RegName.lower()); // The "flags" and "mxcsr" registers cannot be referenced directly. // Treat it as an identifier instead. - if (isParsingInlineAsm() && isParsingIntelSyntax() && + if (isParsingMSInlineAsm() && isParsingIntelSyntax() && (RegNo == X86::EFLAGS || RegNo == X86::MXCSR)) RegNo = 0; @@ -1172,27 +1194,137 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) || X86II::isX86_64NonExtLowByteReg(RegNo) || X86II::isX86_64ExtendedReg(RegNo)) { - StringRef RegName = Tok.getString(); - Parser.Lex(); // Eat register name. return Error(StartLoc, "register %" + RegName + " is only available in 64-bit mode", SMRange(StartLoc, EndLoc)); } } + // If this is "db[0-15]", match it as an alias + // for dr[0-15]. + if (RegNo == 0 && RegName.startswith("db")) { + if (RegName.size() == 3) { + switch (RegName[2]) { + case '0': + RegNo = X86::DR0; + break; + case '1': + RegNo = X86::DR1; + break; + case '2': + RegNo = X86::DR2; + break; + case '3': + RegNo = X86::DR3; + break; + case '4': + RegNo = X86::DR4; + break; + case '5': + RegNo = X86::DR5; + break; + case '6': + RegNo = X86::DR6; + break; + case '7': + RegNo = X86::DR7; + break; + case '8': + RegNo = X86::DR8; + break; + case '9': + RegNo = X86::DR9; + break; + } + } else if (RegName.size() == 4 && RegName[2] == '1') { + switch (RegName[3]) { + case '0': + RegNo = X86::DR10; + break; + case '1': + RegNo = X86::DR11; + break; + case '2': + RegNo = X86::DR12; + break; + case '3': + RegNo = X86::DR13; + break; + case '4': + RegNo = X86::DR14; + break; + case '5': + RegNo = X86::DR15; + break; + } + } + } + + if (RegNo == 0) { + if (isParsingIntelSyntax()) + return true; + return Error(StartLoc, "invalid register name", SMRange(StartLoc, EndLoc)); + } + return false; +} + +bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc, bool RestoreOnFailure) { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + RegNo = 0; + + SmallVector<AsmToken, 5> Tokens; + auto OnFailure = [RestoreOnFailure, &Lexer, &Tokens]() { + if (RestoreOnFailure) { + while (!Tokens.empty()) { + Lexer.UnLex(Tokens.pop_back_val()); + } + } + }; + + const AsmToken &PercentTok = Parser.getTok(); + StartLoc = PercentTok.getLoc(); + + // If we encounter a %, ignore it. This code handles registers with and + // without the prefix, unprefixed registers can occur in cfi directives. + if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) { + Tokens.push_back(PercentTok); + Parser.Lex(); // Eat percent token. + } + + const AsmToken &Tok = Parser.getTok(); + EndLoc = Tok.getEndLoc(); + + if (Tok.isNot(AsmToken::Identifier)) { + OnFailure(); + if (isParsingIntelSyntax()) return true; + return Error(StartLoc, "invalid register name", + SMRange(StartLoc, EndLoc)); + } + + if (MatchRegisterByName(RegNo, Tok.getString(), StartLoc, EndLoc)) { + OnFailure(); + return true; + } + // Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens. 
if (RegNo == X86::ST0) { + Tokens.push_back(Tok); Parser.Lex(); // Eat 'st' // Check to see if we have '(4)' after %st. - if (getLexer().isNot(AsmToken::LParen)) + if (Lexer.isNot(AsmToken::LParen)) return false; // Lex the paren. - getParser().Lex(); + Tokens.push_back(Parser.getTok()); + Parser.Lex(); const AsmToken &IntTok = Parser.getTok(); - if (IntTok.isNot(AsmToken::Integer)) + if (IntTok.isNot(AsmToken::Integer)) { + OnFailure(); return Error(IntTok.getLoc(), "expected stack index"); + } switch (IntTok.getIntVal()) { case 0: RegNo = X86::ST0; break; case 1: RegNo = X86::ST1; break; @@ -1202,11 +1334,18 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, case 5: RegNo = X86::ST5; break; case 6: RegNo = X86::ST6; break; case 7: RegNo = X86::ST7; break; - default: return Error(IntTok.getLoc(), "invalid stack index"); + default: + OnFailure(); + return Error(IntTok.getLoc(), "invalid stack index"); } - if (getParser().Lex().isNot(AsmToken::RParen)) + // Lex IntTok + Tokens.push_back(IntTok); + Parser.Lex(); + if (Lexer.isNot(AsmToken::RParen)) { + OnFailure(); return Error(Parser.getTok().getLoc(), "expected ')'"); + } EndLoc = Parser.getTok().getEndLoc(); Parser.Lex(); // Eat ')' @@ -1215,41 +1354,8 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, EndLoc = Parser.getTok().getEndLoc(); - // If this is "db[0-15]", match it as an alias - // for dr[0-15]. - if (RegNo == 0 && Tok.getString().startswith("db")) { - if (Tok.getString().size() == 3) { - switch (Tok.getString()[2]) { - case '0': RegNo = X86::DR0; break; - case '1': RegNo = X86::DR1; break; - case '2': RegNo = X86::DR2; break; - case '3': RegNo = X86::DR3; break; - case '4': RegNo = X86::DR4; break; - case '5': RegNo = X86::DR5; break; - case '6': RegNo = X86::DR6; break; - case '7': RegNo = X86::DR7; break; - case '8': RegNo = X86::DR8; break; - case '9': RegNo = X86::DR9; break; - } - } else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') { - switch (Tok.getString()[3]) { - case '0': RegNo = X86::DR10; break; - case '1': RegNo = X86::DR11; break; - case '2': RegNo = X86::DR12; break; - case '3': RegNo = X86::DR13; break; - case '4': RegNo = X86::DR14; break; - case '5': RegNo = X86::DR15; break; - } - } - - if (RegNo != 0) { - EndLoc = Parser.getTok().getEndLoc(); - Parser.Lex(); // Eat it. - return false; - } - } - if (RegNo == 0) { + OnFailure(); if (isParsingIntelSyntax()) return true; return Error(StartLoc, "invalid register name", SMRange(StartLoc, EndLoc)); @@ -1259,6 +1365,25 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, return false; } +bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false); +} + +OperandMatchResultTy X86AsmParser::tryParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) { + bool Result = + ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true); + bool PendingErrors = getParser().hasPendingError(); + getParser().clearPendingErrors(); + if (PendingErrors) + return MatchOperand_ParseFail; + if (Result) + return MatchOperand_NoMatch; + return MatchOperand_Success; +} + std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) { bool Parse32 = is32BitMode() || Code16GCC; unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? 
X86::ESI : X86::SI); @@ -1405,7 +1530,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { return ParseATTOperand(); } -std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( +std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, const InlineAsmIdentifierInfo &Info) { @@ -1445,8 +1570,9 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( } else { BaseReg = BaseReg ? BaseReg : 1; return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, - IndexReg, Scale, Start, End, Size, Identifier, - Decl, FrontendSize); + IndexReg, Scale, Start, End, Size, + /*DefaultBaseReg=*/X86::RIP, Identifier, Decl, + FrontendSize); } } @@ -1483,7 +1609,7 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, return true; StringRef ErrMsg; ParseError = - SM.onOffset(Val, OffsetLoc, ID, Info, isParsingInlineAsm(), ErrMsg); + SM.onOffset(Val, OffsetLoc, ID, Info, isParsingMSInlineAsm(), ErrMsg); if (ParseError) return Error(SMLoc::getFromPointer(Name.data()), ErrMsg); } else { @@ -1525,12 +1651,51 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); UpdateLocLex = false; - // Register + // (MASM only) <TYPE> PTR operator + if (Parser.isParsingMasm()) { + const AsmToken &NextTok = getLexer().peekTok(); + if (NextTok.is(AsmToken::Identifier) && + NextTok.getIdentifier().equals_lower("ptr")) { + SM.onCast(Identifier); + // Eat type and PTR. + consumeToken(); + End = consumeToken(); + break; + } + } + // Register, or (MASM only) <register>.<field> unsigned Reg; - if (Tok.is(AsmToken::Identifier) && !ParseRegister(Reg, IdentLoc, End)) { - if (SM.onRegister(Reg, ErrMsg)) - return Error(Tok.getLoc(), ErrMsg); - break; + if (Tok.is(AsmToken::Identifier)) { + if (!ParseRegister(Reg, IdentLoc, End, /*RestoreOnFailure=*/true)) { + if (SM.onRegister(Reg, ErrMsg)) + return Error(IdentLoc, ErrMsg); + break; + } + if (Parser.isParsingMasm()) { + const std::pair<StringRef, StringRef> IDField = + Tok.getString().split('.'); + const StringRef ID = IDField.first, Field = IDField.second; + SMLoc IDEndLoc = SMLoc::getFromPointer(ID.data() + ID.size()); + if (!Field.empty() && + !MatchRegisterByName(Reg, ID, IdentLoc, IDEndLoc)) { + if (SM.onRegister(Reg, ErrMsg)) + return Error(IdentLoc, ErrMsg); + + StringRef Type; + unsigned Offset = 0; + SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data()); + if (Parser.lookUpField(Field, Type, Offset)) + return Error(FieldStartLoc, "unknown offset"); + else if (SM.onPlus(ErrMsg)) + return Error(getTok().getLoc(), ErrMsg); + else if (SM.onInteger(Offset, ErrMsg)) + return Error(IdentLoc, ErrMsg); + SM.setType(Type); + + End = consumeToken(); + break; + } + } } // Operator synonymous ("not", "or" etc.) 
bool ParseError = false; @@ -1542,37 +1707,40 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { // Symbol reference, when parsing assembly content InlineAsmIdentifierInfo Info; const MCExpr *Val; - if (!isParsingInlineAsm()) { - if (getParser().parsePrimaryExpr(Val, End)) { - return Error(Tok.getLoc(), "Unexpected identifier!"); - } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) { - return Error(IdentLoc, ErrMsg); - } else + if (isParsingMSInlineAsm() || Parser.isParsingMasm()) { + // MS Dot Operator expression + if (Identifier.count('.') && + (PrevTK == AsmToken::RBrac || PrevTK == AsmToken::RParen)) { + if (ParseIntelDotOperator(SM, End)) + return true; break; + } } - // MS InlineAsm operators (TYPE/LENGTH/SIZE) - if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) { - if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) { - if (SM.onInteger(Val, ErrMsg)) - return Error(IdentLoc, ErrMsg); - } else - return true; - break; - } - // MS Dot Operator expression - if (Identifier.count('.') && PrevTK == AsmToken::RBrac) { - if (ParseIntelDotOperator(SM, End)) + if (isParsingMSInlineAsm()) { + // MS InlineAsm operators (TYPE/LENGTH/SIZE) + if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) { + if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) { + if (SM.onInteger(Val, ErrMsg)) + return Error(IdentLoc, ErrMsg); + } else + return true; + break; + } + // MS InlineAsm identifier + // Call parseIdentifier() to combine @ with the identifier behind it. + if (TK == AsmToken::At && Parser.parseIdentifier(Identifier)) + return Error(IdentLoc, "expected identifier"); + if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End)) return true; + else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg)) + return Error(IdentLoc, ErrMsg); break; } - // MS InlineAsm identifier - // Call parseIdentifier() to combine @ with the identifier behind it. - if (TK == AsmToken::At && Parser.parseIdentifier(Identifier)) - return Error(IdentLoc, "expected identifier"); - if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End)) - return true; - else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg)) + if (getParser().parsePrimaryExpr(Val, End)) { + return Error(Tok.getLoc(), "Unexpected identifier!"); + } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) { return Error(IdentLoc, ErrMsg); + } break; } case AsmToken::Integer: { @@ -1593,8 +1761,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); InlineAsmIdentifierInfo Info; - if (SM.onIdentifierExpr(Val, Identifier, Info, - isParsingInlineAsm(), ErrMsg)) + if (SM.onIdentifierExpr(Val, Identifier, Info, isParsingMSInlineAsm(), + ErrMsg)) return Error(Loc, ErrMsg); End = consumeToken(); } else { @@ -1688,7 +1856,7 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier( const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator) { MCAsmParser &Parser = getParser(); - assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); + assert(isParsingMSInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); @@ -1777,9 +1945,11 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) { } /// Parse the '.' operator. 
-bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) { +bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, + SMLoc &End) { const AsmToken &Tok = getTok(); - unsigned Offset; + StringRef Type; + unsigned Offset = 0; // Drop the optional '.'. StringRef DotDispStr = Tok.getString(); @@ -1791,10 +1961,15 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) APInt DotDisp; DotDispStr.getAsInteger(10, DotDisp); Offset = DotDisp.getZExtValue(); - } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { - std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); - if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, - Offset)) + } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) && + Tok.is(AsmToken::Identifier)) { + const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); + const StringRef Base = BaseMember.first, Member = BaseMember.second; + if (getParser().lookUpField(SM.getType(), DotDispStr, Type, Offset) && + getParser().lookUpField(SM.getSymName(), DotDispStr, Type, Offset) && + getParser().lookUpField(DotDispStr, Type, Offset) && + (!SemaCallback || + SemaCallback->LookupInlineAsmField(Base, Member, Offset))) return Error(Tok.getLoc(), "Unable to lookup field reference!"); } else return Error(Tok.getLoc(), "Unexpected token type!"); @@ -1805,6 +1980,7 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) while (Tok.getLoc().getPointer() < DotExprEndLoc) Lex(); SM.addImm(Offset); + SM.setType(Type); return false; } @@ -1816,7 +1992,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID, // Eat offset, mark start of identifier. SMLoc Start = Lex().getLoc(); ID = getTok().getString(); - if (!isParsingInlineAsm()) { + if (!isParsingMSInlineAsm()) { if ((getTok().isNot(AsmToken::Identifier) && getTok().isNot(AsmToken::String)) || getParser().parsePrimaryExpr(Val, End)) @@ -1939,7 +2115,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (ParseIntelExpression(SM, End)) return nullptr; - if (isParsingInlineAsm()) + if (isParsingMSInlineAsm()) RewriteIntelExpression(SM, Start, Tok.getLoc()); int64_t Imm = SM.getImm(); @@ -1953,7 +2129,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // RegNo != 0 specifies a valid segment register, // and we are parsing a segment override if (!SM.isMemExpr() && !RegNo) { - if (isParsingInlineAsm() && SM.isOffsetOperator()) { + if (isParsingMSInlineAsm() && SM.isOffsetOperator()) { const InlineAsmIdentifierInfo Info = SM.getIdentifierInfo(); if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { // Disp includes the address of a variable; make sure this is recorded @@ -2005,10 +2181,18 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(), ErrMsg)) return ErrorOperand(Start, ErrMsg); - if (isParsingInlineAsm()) - return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg, - Scale, Start, End, Size, SM.getSymName(), - SM.getIdentifierInfo()); + if (isParsingMSInlineAsm()) + return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start, + End, Size, SM.getSymName(), + SM.getIdentifierInfo()); + + // When parsing x64 MS-style assembly, all memory operands default to + // RIP-relative when interpreted as non-absolute references. 
+ if (Parser.isParsingMasm() && is64BitMode()) + return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, BaseReg, + IndexReg, Scale, Start, End, Size, + /*DefaultBaseReg=*/X86::RIP); + if (!(BaseReg || IndexReg || RegNo)) return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, @@ -2420,8 +2604,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(Parser.getTok().getLoc(), "Expected '}'"); Parser.Lex(); // Eat curly. - if (Prefix == "vex2") - ForcedVEXEncoding = VEXEncoding_VEX2; + if (Prefix == "vex" || Prefix == "vex2") + ForcedVEXEncoding = VEXEncoding_VEX; else if (Prefix == "vex3") ForcedVEXEncoding = VEXEncoding_VEX3; else if (Prefix == "evex") @@ -2711,7 +2895,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // In MS inline asm curly braces mark the beginning/end of a block, // therefore they should be interepreted as end of statement CurlyAsEndOfStatement = - isParsingIntelSyntax() && isParsingInlineAsm() && + isParsingIntelSyntax() && isParsingMSInlineAsm() && (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly)); if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement) return TokError("unexpected token in argument list"); @@ -3096,9 +3280,122 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { static const char *getSubtargetFeatureName(uint64_t Val); -void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, +void X86AsmParser::emitWarningForSpecialLVIInstruction(SMLoc Loc) { + Warning(Loc, "Instruction may be vulnerable to LVI and " + "requires manual mitigation"); + Note(SMLoc(), "See https://software.intel.com/" + "security-software-guidance/insights/" + "deep-dive-load-value-injection#specialinstructions" + " for more information"); +} + +/// RET instructions and also instructions that indirect calls/jumps from memory +/// combine a load and a branch within a single instruction. To mitigate these +/// instructions against LVI, they must be decomposed into separate load and +/// branch instructions, with an LFENCE in between. For more details, see: +/// - X86LoadValueInjectionRetHardening.cpp +/// - X86LoadValueInjectionIndirectThunks.cpp +/// - https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection +/// +/// Returns `true` if a mitigation was applied or warning was emitted. +void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) { + // Information on control-flow instructions that require manual mitigation can + // be found here: + // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions + switch (Inst.getOpcode()) { + case X86::RETW: + case X86::RETL: + case X86::RETQ: + case X86::RETIL: + case X86::RETIQ: + case X86::RETIW: { + MCInst ShlInst, FenceInst; + bool Parse32 = is32BitMode() || Code16GCC; + unsigned Basereg = + is64BitMode() ? X86::RSP : (Parse32 ? 
X86::ESP : X86::SP); + const MCExpr *Disp = MCConstantExpr::create(0, getContext()); + auto ShlMemOp = X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, + /*BaseReg=*/Basereg, /*IndexReg=*/0, + /*Scale=*/1, SMLoc{}, SMLoc{}, 0); + ShlInst.setOpcode(X86::SHL64mi); + ShlMemOp->addMemOperands(ShlInst, 5); + ShlInst.addOperand(MCOperand::createImm(0)); + FenceInst.setOpcode(X86::LFENCE); + Out.emitInstruction(ShlInst, getSTI()); + Out.emitInstruction(FenceInst, getSTI()); + return; + } + case X86::JMP16m: + case X86::JMP32m: + case X86::JMP64m: + case X86::CALL16m: + case X86::CALL32m: + case X86::CALL64m: + emitWarningForSpecialLVIInstruction(Inst.getLoc()); + return; + } +} + +/// To mitigate LVI, every instruction that performs a load can be followed by +/// an LFENCE instruction to squash any potential mis-speculation. There are +/// some instructions that require additional considerations, and may requre +/// manual mitigation. For more details, see: +/// https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection +/// +/// Returns `true` if a mitigation was applied or warning was emitted. +void X86AsmParser::applyLVILoadHardeningMitigation(MCInst &Inst, + MCStreamer &Out) { + auto Opcode = Inst.getOpcode(); + auto Flags = Inst.getFlags(); + if ((Flags & X86::IP_HAS_REPEAT) || (Flags & X86::IP_HAS_REPEAT_NE)) { + // Information on REP string instructions that require manual mitigation can + // be found here: + // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions + switch (Opcode) { + case X86::CMPSB: + case X86::CMPSW: + case X86::CMPSL: + case X86::CMPSQ: + case X86::SCASB: + case X86::SCASW: + case X86::SCASL: + case X86::SCASQ: + emitWarningForSpecialLVIInstruction(Inst.getLoc()); + return; + } + } else if (Opcode == X86::REP_PREFIX || Opcode == X86::REPNE_PREFIX) { + // If a REP instruction is found on its own line, it may or may not be + // followed by a vulnerable instruction. Emit a warning just in case. + emitWarningForSpecialLVIInstruction(Inst.getLoc()); + return; + } + + const MCInstrDesc &MCID = MII.get(Inst.getOpcode()); + + // Can't mitigate after terminators or calls. A control flow change may have + // already occurred. + if (MCID.isTerminator() || MCID.isCall()) + return; + + // LFENCE has the mayLoad property, don't double fence. 
+ if (MCID.mayLoad() && Inst.getOpcode() != X86::LFENCE) { + MCInst FenceInst; + FenceInst.setOpcode(X86::LFENCE); + Out.emitInstruction(FenceInst, getSTI()); + } +} + +void X86AsmParser::emitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { - Out.EmitInstruction(Inst, getSTI()); + if (LVIInlineAsmHardening && + getSTI().getFeatureBits()[X86::FeatureLVIControlFlowIntegrity]) + applyLVICFIMitigation(Inst, Out); + + Out.emitInstruction(Inst, getSTI()); + + if (LVIInlineAsmHardening && + getSTI().getFeatureBits()[X86::FeatureLVILoadHardening]) + applyLVILoadHardeningMitigation(Inst, Out); } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -3133,7 +3430,7 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op, Inst.setOpcode(X86::WAIT); Inst.setLoc(IDLoc); if (!MatchingInlineAsm) - EmitInstruction(Inst, Operands, Out); + emitInstruction(Inst, Operands, Out); Operands[0] = X86Operand::CreateToken(Repl, IDLoc); } } @@ -3170,7 +3467,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { (MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX) return Match_Unsupported; - if ((ForcedVEXEncoding == VEXEncoding_VEX2 || + if ((ForcedVEXEncoding == VEXEncoding_VEX || ForcedVEXEncoding == VEXEncoding_VEX3) && (MCID.TSFlags & X86II::EncodingMask) != X86II::VEX) return Match_Unsupported; @@ -3240,7 +3537,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, Inst.setLoc(IDLoc); if (!MatchingInlineAsm) - EmitInstruction(Inst, Operands, Out); + emitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; case Match_InvalidImmUnsignedi4: { @@ -3282,20 +3579,47 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // Otherwise, we assume that this may be an integer instruction, which comes // in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively. const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0"; + // MemSize corresponding to Suffixes. { 8, 16, 32, 64 } { 32, 64, 80, 0 } + const char *MemSize = Base[0] != 'f' ? "\x08\x10\x20\x40" : "\x20\x40\x50\0"; // Check for the various suffix matches. uint64_t ErrorInfoIgnore; FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings. unsigned Match[4]; + // Some instruction like VPMULDQ is NOT the variant of VPMULD but a new one. + // So we should make sure the suffix matcher only works for memory variant + // that has the same size with the suffix. + // FIXME: This flag is a workaround for legacy instructions that didn't + // declare non suffix variant assembly. + bool HasVectorReg = false; + X86Operand *MemOp = nullptr; + for (const auto &Op : Operands) { + X86Operand *X86Op = static_cast<X86Operand *>(Op.get()); + if (X86Op->isVectorReg()) + HasVectorReg = true; + else if (X86Op->isMem()) { + MemOp = X86Op; + assert(MemOp->Mem.Size == 0 && "Memory size always 0 under ATT syntax"); + // Have we found an unqualified memory operand, + // break. IA allows only one memory operand. + break; + } + } + for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) { Tmp.back() = Suffixes[I]; - Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore, - MissingFeatures, MatchingInlineAsm, - isParsingIntelSyntax()); - // If this returned as a missing feature failure, remember that. 
- if (Match[I] == Match_MissingFeature) - ErrorInfoMissingFeatures = MissingFeatures; + if (MemOp && HasVectorReg) + MemOp->Mem.Size = MemSize[I]; + Match[I] = Match_MnemonicFail; + if (MemOp || !HasVectorReg) { + Match[I] = + MatchInstruction(Operands, Inst, ErrorInfoIgnore, MissingFeatures, + MatchingInlineAsm, isParsingIntelSyntax()); + // If this returned as a missing feature failure, remember that. + if (Match[I] == Match_MissingFeature) + ErrorInfoMissingFeatures = MissingFeatures; + } } // Restore the old token. @@ -3309,7 +3633,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, if (NumSuccessfulMatches == 1) { Inst.setLoc(IDLoc); if (!MatchingInlineAsm) - EmitInstruction(Inst, Operands, Out); + emitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; } @@ -3562,7 +3886,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, ; Inst.setLoc(IDLoc); if (!MatchingInlineAsm) - EmitInstruction(Inst, Operands, Out); + emitInstruction(Inst, Operands, Out); Opcode = Inst.getOpcode(); return false; } else if (NumSuccessfulMatches > 1) { @@ -3684,9 +4008,9 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { Section = getStreamer().getCurrentSectionOnly(); } if (Section->UseCodeAlign()) - getStreamer().EmitCodeAlignment(2, 0); + getStreamer().emitCodeAlignment(2, 0); else - getStreamer().EmitValueToAlignment(2, 0, 1, 0); + getStreamer().emitValueToAlignment(2, 0, 1, 0); return false; } @@ -3699,7 +4023,7 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { Parser.Lex(); if (!is16BitMode()) { SwitchMode(X86::Mode16Bit); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } } else if (IDVal == ".code16gcc") { // .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode. 
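The LVI hunks above (emitWarningForSpecialLVIInstruction, applyLVICFIMitigation, applyLVILoadHardeningMitigation, and the emitInstruction wrapper) amount to a simple rewrite of the emitted instruction stream when -x86-experimental-lvi-inline-asm-hardening is enabled and the corresponding LVI features are on: an instruction that loads from memory is followed by an LFENCE, and a RET is preceded by a "shlq $0, (%rsp); lfence" pair so the return address it consumes is not attacker-controllable under mis-speculation. The sketch below is a minimal standalone model of that rewrite (64-bit case only), not the LLVM code; the string heuristic that spots a load stands in for the real MCInstrDesc::mayLoad() check and is an assumption made purely for illustration.

#include <iostream>
#include <string>
#include <vector>

// Toy model of the LVI hardening performed by the asm-parser hunks above:
// - after an instruction that loads from memory, insert an LFENCE
//   (load hardening);
// - before a RET, emit "shlq $0, (%rsp); lfence" so the return address is
//   rewritten and fenced before RET loads it (CFI mitigation).
static std::vector<std::string>
hardenForLVI(const std::vector<std::string> &In) {
  std::vector<std::string> Out;
  for (const std::string &I : In) {
    bool IsRet = I.rfind("ret", 0) == 0;
    // Crude stand-in for MCID.mayLoad(): any non-ret with a memory operand.
    bool MayLoad = !IsRet && I.find("(%") != std::string::npos;
    if (IsRet) {
      Out.push_back("shlq $0, (%rsp)");
      Out.push_back("lfence");
    }
    Out.push_back(I);
    if (MayLoad)
      Out.push_back("lfence");
  }
  return Out;
}

int main() {
  // Prints: movq (%rdi), %rax / lfence / shlq $0, (%rsp) / lfence / ret
  for (const std::string &I : hardenForLVI({"movq (%rdi), %rax", "ret"}))
    std::cout << I << '\n';
}

Indirect JMP/CALL through memory and REP-string instructions are the cases the real code only warns about: as the hunk's comments note, they combine a load and a branch in one instruction and need to be decomposed manually (or by the dedicated machine passes) rather than fenced in place by the assembler.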
@@ -3707,19 +4031,19 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { Code16GCC = true; if (!is16BitMode()) { SwitchMode(X86::Mode16Bit); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code16); } } else if (IDVal == ".code32") { Parser.Lex(); if (!is32BitMode()) { SwitchMode(X86::Mode32Bit); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code32); } } else if (IDVal == ".code64") { Parser.Lex(); if (!is64BitMode()) { SwitchMode(X86::Mode64Bit); - getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64); + getParser().getStreamer().emitAssemblerFlag(MCAF_Code64); } } else { Error(L, "unknown directive " + IDVal); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h index d831a63b04ee..5cf4516ede97 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -17,9 +17,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SMLoc.h" #include <cassert> #include <memory> @@ -60,6 +58,7 @@ struct X86Operand final : public MCParsedAsmOperand { unsigned SegReg; const MCExpr *Disp; unsigned BaseReg; + unsigned DefaultBaseReg; unsigned IndexReg; unsigned Scale; unsigned Size; @@ -184,6 +183,10 @@ struct X86Operand final : public MCParsedAsmOperand { assert(Kind == Memory && "Invalid access!"); return Mem.BaseReg; } + unsigned getMemDefaultBaseReg() const { + assert(Kind == Memory && "Invalid access!"); + return Mem.DefaultBaseReg; + } unsigned getMemIndexReg() const { assert(Kind == Memory && "Invalid access!"); return Mem.IndexReg; @@ -312,6 +315,11 @@ struct X86Operand final : public MCParsedAsmOperand { bool isMem512() const { return Kind == Memory && (!Mem.Size || Mem.Size == 512); } + + bool isSibMem() const { + return isMem() && Mem.BaseReg != X86::RIP && Mem.BaseReg != X86::EIP; + } + bool isMemIndexReg(unsigned LowR, unsigned HighR) const { assert(Kind == Memory && "Invalid access!"); return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR; @@ -458,6 +466,14 @@ struct X86Operand final : public MCParsedAsmOperand { X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg())); } + bool isVectorReg() const { + return Kind == Register && + (X86MCRegisterClasses[X86::VR64RegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::VR128XRegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::VR256XRegClassID].contains(getReg()) || + X86MCRegisterClasses[X86::VR512RegClassID].contains(getReg())); + } + bool isVK1Pair() const { return Kind == Register && X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg()); @@ -540,7 +556,10 @@ struct X86Operand final : public MCParsedAsmOperand { void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + if (getMemBaseReg()) + Inst.addOperand(MCOperand::createReg(getMemBaseReg())); + else + Inst.addOperand(MCOperand::createReg(getMemDefaultBaseReg())); Inst.addOperand(MCOperand::createImm(getMemScale())); Inst.addOperand(MCOperand::createReg(getMemIndexReg())); addExpr(Inst, getMemDisp()); @@ -633,6 +652,7 @@ struct X86Operand final : public 
MCParsedAsmOperand { Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; Res->Mem.BaseReg = 0; + Res->Mem.DefaultBaseReg = 0; Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; @@ -648,11 +668,14 @@ struct X86Operand final : public MCParsedAsmOperand { static std::unique_ptr<X86Operand> CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, - SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(), - void *OpDecl = nullptr, unsigned FrontendSize = 0) { + SMLoc EndLoc, unsigned Size = 0, + unsigned DefaultBaseReg = X86::NoRegister, + StringRef SymName = StringRef(), void *OpDecl = nullptr, + unsigned FrontendSize = 0) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. - assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); + assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) && + "Invalid memory operand!"); // The scale should always be one of {1,2,4,8}. assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) && @@ -661,6 +684,7 @@ struct X86Operand final : public MCParsedAsmOperand { Res->Mem.SegReg = SegReg; Res->Mem.Disp = Disp; Res->Mem.BaseReg = BaseReg; + Res->Mem.DefaultBaseReg = DefaultBaseReg; Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index ea8c606d1564..a7fa1eb9a5ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -776,6 +776,10 @@ static int readModRM(struct InternalInstruction *insn) { return prefix##_YMM0 + index; \ case TYPE_XMM: \ return prefix##_XMM0 + index; \ + case TYPE_TMM: \ + if (index > 7) \ + *valid = 0; \ + return prefix##_TMM0 + index; \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ @@ -849,6 +853,7 @@ static int fixupReg(struct InternalInstruction *insn, if (!valid) return -1; break; + case ENCODING_SIB: CASE_ENCODING_RM: if (insn->eaBase >= insn->eaRegBase) { insn->eaBase = (EABase)fixupRMValue( @@ -1533,6 +1538,15 @@ static int readOperands(struct InternalInstruction *insn) { if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB); break; + case ENCODING_SIB: + // Reject if SIB wasn't used. + if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64) + return -1; + if (readModRM(insn)) + return -1; + if (fixupReg(insn, &Op)) + return -1; + break; case ENCODING_REG: CASE_ENCODING_RM: if (readModRM(insn)) @@ -2006,9 +2020,11 @@ static bool translateRMRegister(MCInst &mcInst, /// @param mcInst - The MCInst to append to. /// @param insn - The instruction to extract Mod, R/M, and SIB fields /// from. +/// @param ForceSIB - The instruction must use SIB. /// @return - 0 on success; nonzero otherwise static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, - const MCDisassembler *Dis) { + const MCDisassembler *Dis, + bool ForceSIB = false) { // Addresses in an MCInst are represented as five operands: // 1. basereg (register) The R/M base, or (if there is a SIB) the // SIB base @@ -2067,11 +2083,12 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, // -Any base register used other than ESP/RSP/R12D/R12. Using these as a // base always requires a SIB byte. 
// -A scale other than 1 is used. - if (insn.sibScale != 1 || - (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) || - (insn.sibBase != SIB_BASE_NONE && - insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP && - insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12)) { + if (!ForceSIB && + (insn.sibScale != 1 || + (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) || + (insn.sibBase != SIB_BASE_NONE && + insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP && + insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12))) { indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ : X86::RIZ); } else @@ -2182,6 +2199,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM: case TYPE_YMM: case TYPE_ZMM: + case TYPE_TMM: case TYPE_VK_PAIR: case TYPE_VK: case TYPE_DEBUGREG: @@ -2193,6 +2211,8 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_MVSIBY: case TYPE_MVSIBZ: return translateRMMemory(mcInst, insn, Dis); + case TYPE_MSIB: + return translateRMMemory(mcInst, insn, Dis, true); } } @@ -2242,6 +2262,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, return false; case ENCODING_WRITEMASK: return translateMaskRegister(mcInst, insn.writemask); + case ENCODING_SIB: CASE_ENCODING_RM: CASE_ENCODING_VSIB: return translateRM(mcInst, operand, insn, Dis); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index 147fe46d81b9..4318c17f03a0 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -19,9 +19,6 @@ #include "llvm/Support/X86DisassemblerDecoderCommon.h" namespace llvm { - -class MCInstrInfo; - namespace X86Disassembler { // Accessor functions for various fields of an Intel instruction @@ -383,6 +380,17 @@ namespace X86Disassembler { ENTRY(BND2) \ ENTRY(BND3) +#undef REGS_TMM +#define REGS_TMM \ + ENTRY(TMM0) \ + ENTRY(TMM1) \ + ENTRY(TMM2) \ + ENTRY(TMM3) \ + ENTRY(TMM4) \ + ENTRY(TMM5) \ + ENTRY(TMM6) \ + ENTRY(TMM7) + #define ALL_EA_BASES \ EA_BASES_16BIT \ EA_BASES_32BIT \ @@ -407,6 +415,7 @@ namespace X86Disassembler { REGS_DEBUG \ REGS_CONTROL \ REGS_BOUND \ + REGS_TMM \ ENTRY(RIP) /// All possible values of the base field for effective-address diff --git a/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h b/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h index 5833017037a5..56738e9cfa73 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h @@ -28,7 +28,6 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/Support/raw_ostream.h" #include <algorithm> #include <iterator> #include <utility> diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index 675a9c377b12..0134b4efce72 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -56,7 +56,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (MI->getOpcode() == X86::CALLpcrel32 && (STI.getFeatureBits()[X86::Mode64Bit])) { OS << 
"\tcallq\t"; - printPCRelImm(MI, 0, OS); + printPCRelImm(MI, Address, 0, OS); } // data16 and data32 both have the same encoding of 0x66. While data32 is // valid only in 16 bit systems, data16 is valid in the rest. @@ -68,8 +68,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address, OS << "\tdata32"; } // Try to print any aliases first. - else if (!printAliasInstr(MI, OS) && - !printVecCompareInstr(MI, OS)) + else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) printInstruction(MI, Address, OS); // Next always print the annotation. diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h index 3d5d384dc4a0..51ddae61d251 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h @@ -30,9 +30,10 @@ public: // Autogenerated by tblgen, returns true if we successfully printed an // alias. - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, + raw_ostream &O); // Autogenerated by tblgen. void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS); @@ -46,13 +47,6 @@ public: void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS); void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { printMemReference(MI, OpNo, O); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 2284cd7a70b8..bf3b6bcb5463 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -12,7 +12,9 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -60,10 +62,9 @@ public: else if (BranchType == "indirect") addKind(X86::AlignBranchIndirect); else { - report_fatal_error( - "'-x86-align-branch 'The branches's type is combination of jcc, " - "fused, jmp, call, ret, indirect.(plus separated)", - false); + errs() << "invalid argument " << BranchType.str() + << " to -x86-align-branch=; each element must be one of: fused, " + "jcc, jmp, call, ret, indirect.(plus separated)\n"; } } } @@ -86,12 +87,13 @@ cl::opt<unsigned> X86AlignBranchBoundary( cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch( "x86-align-branch", cl::desc( - "Specify types of branches to align. The branches's types are " - "combination of jcc, fused, jmp, call, ret, indirect. 
jcc indicates " - "conditional jumps, fused indicates fused conditional jumps, jmp " - "indicates unconditional jumps, call indicates direct and indirect " - "calls, ret indicates rets, indirect indicates indirect jumps."), - cl::value_desc("(plus separated list of types)"), + "Specify types of branches to align (plus separated list of types):" + "\njcc indicates conditional jumps" + "\nfused indicates fused conditional jumps" + "\njmp indicates direct unconditional jumps" + "\ncall indicates direct and indirect calls" + "\nret indicates rets" + "\nindirect indicates indirect unconditional jumps"), cl::location(X86AlignBranchKindLoc)); cl::opt<bool> X86AlignBranchWithin32BBoundaries( @@ -102,6 +104,18 @@ cl::opt<bool> X86AlignBranchWithin32BBoundaries( "assumptions about labels corresponding to particular instructions, " "and should be used with caution.")); +cl::opt<unsigned> X86PadMaxPrefixSize( + "x86-pad-max-prefix-size", cl::init(0), + cl::desc("Maximum number of prefixes to use for padding")); + +cl::opt<bool> X86PadForAlign( + "x86-pad-for-align", cl::init(true), cl::Hidden, + cl::desc("Pad previous instructions to implement align directives")); + +cl::opt<bool> X86PadForBranchAlign( + "x86-pad-for-branch-align", cl::init(true), cl::Hidden, + cl::desc("Pad previous instructions to implement branch alignment")); + class X86ELFObjectWriter : public MCELFObjectTargetWriter { public: X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine, @@ -114,14 +128,18 @@ class X86AsmBackend : public MCAsmBackend { std::unique_ptr<const MCInstrInfo> MCII; X86AlignBranchKind AlignBranchType; Align AlignBoundary; + unsigned TargetPrefixMax = 0; - bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; - - bool needAlign(MCObjectStreamer &OS) const; - bool needAlignInst(const MCInst &Inst) const; - MCBoundaryAlignFragment * - getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const; MCInst PrevInst; + MCBoundaryAlignFragment *PendingBA = nullptr; + std::pair<MCFragment *, size_t> PrevInstPosition; + bool CanPadInst; + + uint8_t determinePaddingPrefix(const MCInst &Inst) const; + bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const; + bool needAlign(const MCInst &Inst) const; + bool canPadBranches(MCObjectStreamer &OS) const; + bool canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const; public: X86AsmBackend(const Target &T, const MCSubtargetInfo &STI) @@ -142,11 +160,14 @@ public: AlignBoundary = assumeAligned(X86AlignBranchBoundary); if (X86AlignBranch.getNumOccurrences()) AlignBranchType = X86AlignBranchKindLoc; + if (X86PadMaxPrefixSize.getNumOccurrences()) + TargetPrefixMax = X86PadMaxPrefixSize; } bool allowAutoPadding() const override; - void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override; - void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override; + bool allowEnhancedRelaxation() const override; + void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) override; + void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) override; unsigned getNumFixupKinds() const override { return X86::NumTargetFixupKinds; @@ -155,7 +176,7 @@ public: Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; - + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target) override; @@ -171,22 +192,34 @@ public: const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void 
relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, - MCInst &Res) const override; + void relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const override; + + bool padInstructionViaRelaxation(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const; + + bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + unsigned &RemainingSize) const; + + bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + unsigned &RemainingSize) const; + + void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override; bool writeNopData(raw_ostream &OS, uint64_t Count) const override; }; } // end anonymous namespace -static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) { +static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) { unsigned Op = Inst.getOpcode(); switch (Op) { default: return Op; case X86::JCC_1: - return (is16BitMode) ? X86::JCC_2 : X86::JCC_4; + return (Is16BitMode) ? X86::JCC_2 : X86::JCC_4; case X86::JMP_1: - return (is16BitMode) ? X86::JMP_2 : X86::JMP_4; + return (Is16BitMode) ? X86::JMP_2 : X86::JMP_4; } } @@ -275,11 +308,11 @@ static unsigned getRelaxedOpcodeArith(const MCInst &Inst) { } } -static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) { +static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) { unsigned R = getRelaxedOpcodeArith(Inst); if (R != Inst.getOpcode()) return R; - return getRelaxedOpcodeBranch(Inst, is16BitMode); + return getRelaxedOpcodeBranch(Inst, Is16BitMode); } static X86::CondCode getCondFromBranch(const MCInst &MI, @@ -316,6 +349,11 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) { return (BaseReg == X86::RIP); } +/// Check if the instruction is a prefix. +static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) { + return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags); +} + /// Check if the instruction is valid as the first instruction in macro fusion. static bool isFirstMacroFusibleInst(const MCInst &Inst, const MCInstrInfo &MCII) { @@ -327,6 +365,69 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst, return FIK != X86::FirstMacroFusionInstKind::Invalid; } +/// X86 can reduce the bytes of NOP by padding instructions with prefixes to +/// get a better peformance in some cases. Here, we determine which prefix is +/// the most suitable. +/// +/// If the instruction has a segment override prefix, use the existing one. +/// If the target is 64-bit, use the CS. +/// If the target is 32-bit, +/// - If the instruction has a ESP/EBP base register, use SS. +/// - Otherwise use DS. +uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const { + assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) && + "Prefixes can be added only in 32-bit or 64-bit mode."); + const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + // Determine where the memory operand starts, if present. + int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); + if (MemoryOperand != -1) + MemoryOperand += X86II::getOperandBias(Desc); + + unsigned SegmentReg = 0; + if (MemoryOperand >= 0) { + // Check for explicit segment override on memory operand. + SegmentReg = Inst.getOperand(MemoryOperand + X86::AddrSegmentReg).getReg(); + } + + switch (TSFlags & X86II::FormMask) { + default: + break; + case X86II::RawFrmDstSrc: { + // Check segment override opcode prefix as needed (not for %ds). 
+ if (Inst.getOperand(2).getReg() != X86::DS) + SegmentReg = Inst.getOperand(2).getReg(); + break; + } + case X86II::RawFrmSrc: { + // Check segment override opcode prefix as needed (not for %ds). + if (Inst.getOperand(1).getReg() != X86::DS) + SegmentReg = Inst.getOperand(1).getReg(); + break; + } + case X86II::RawFrmMemOffs: { + // Check segment override opcode prefix as needed. + SegmentReg = Inst.getOperand(1).getReg(); + break; + } + } + + if (SegmentReg != 0) + return X86::getSegmentOverridePrefixForReg(SegmentReg); + + if (STI.hasFeature(X86::Mode64Bit)) + return X86::CS_Encoding; + + if (MemoryOperand >= 0) { + unsigned BaseRegNum = MemoryOperand + X86::AddrBaseReg; + unsigned BaseReg = Inst.getOperand(BaseRegNum).getReg(); + if (BaseReg == X86::ESP || BaseReg == X86::EBP) + return X86::SS_Encoding; + } + return X86::DS_Encoding; +} + /// Check if the two instructions will be macro-fused on the target cpu. bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const { const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode()); @@ -355,19 +456,122 @@ static bool hasVariantSymbol(const MCInst &MI) { } bool X86AsmBackend::allowAutoPadding() const { - return (AlignBoundary != Align::None() && - AlignBranchType != X86::AlignBranchNone); + return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone); +} + +bool X86AsmBackend::allowEnhancedRelaxation() const { + return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign; +} + +/// X86 has certain instructions which enable interrupts exactly one +/// instruction *after* the instruction which stores to SS. Return true if the +/// given instruction has such an interrupt delay slot. +static bool hasInterruptDelaySlot(const MCInst &Inst) { + switch (Inst.getOpcode()) { + case X86::POPSS16: + case X86::POPSS32: + case X86::STI: + return true; + + case X86::MOV16sr: + case X86::MOV32sr: + case X86::MOV64sr: + case X86::MOV16sm: + if (Inst.getOperand(0).getReg() == X86::SS) + return true; + break; + } + return false; +} + +/// Check if the instruction to be emitted is right after any data. +static bool +isRightAfterData(MCFragment *CurrentFragment, + const std::pair<MCFragment *, size_t> &PrevInstPosition) { + MCFragment *F = CurrentFragment; + // Empty data fragments may be created to prevent further data being + // added into the previous fragment, we need to skip them since they + // have no contents. + for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode()) + if (cast<MCDataFragment>(F)->getContents().size() != 0) + break; + + // Since data is always emitted into a DataFragment, our check strategy is + // simple here. + // - If the fragment is a DataFragment + // - If it's not the fragment where the previous instruction is, + // returns true. + // - If it's the fragment holding the previous instruction but its + // size changed since the the previous instruction was emitted into + // it, returns true. + // - Otherwise returns false. + // - If the fragment is not a DataFragment, returns false. + if (auto *DF = dyn_cast_or_null<MCDataFragment>(F)) + return DF != PrevInstPosition.first || + DF->getContents().size() != PrevInstPosition.second; + + return false; +} + +/// \returns the fragment size if it has instructions, otherwise returns 0. +static size_t getSizeForInstFragment(const MCFragment *F) { + if (!F || !F->hasInstructions()) + return 0; + // MCEncodedFragmentWithContents being templated makes this tricky. 
+ switch (F->getKind()) { + default: + llvm_unreachable("Unknown fragment with instructions!"); + case MCFragment::FT_Data: + return cast<MCDataFragment>(*F).getContents().size(); + case MCFragment::FT_Relaxable: + return cast<MCRelaxableFragment>(*F).getContents().size(); + case MCFragment::FT_CompactEncodedInst: + return cast<MCCompactEncodedInstFragment>(*F).getContents().size(); + } } -bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const { +/// Return true if we can insert NOP or prefixes automatically before the +/// the instruction to be emitted. +bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const { + if (hasVariantSymbol(Inst)) + // Linker may rewrite the instruction with variant symbol operand(e.g. + // TLSCALL). + return false; + + if (hasInterruptDelaySlot(PrevInst)) + // If this instruction follows an interrupt enabling instruction with a one + // instruction delay, inserting a nop would change behavior. + return false; + + if (isPrefix(PrevInst, *MCII)) + // If this instruction follows a prefix, inserting a nop/prefix would change + // semantic. + return false; + + if (isPrefix(Inst, *MCII)) + // If this instruction is a prefix, inserting a prefix would change + // semantic. + return false; + + if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition)) + // If this instruction follows any data, there is no clear + // instruction boundary, inserting a nop/prefix would change semantic. + return false; + + return true; +} + +bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const { if (!OS.getAllowAutoPadding()) return false; assert(allowAutoPadding() && "incorrect initialization!"); - MCAssembler &Assembler = OS.getAssembler(); - MCSection *Sec = OS.getCurrentSectionOnly(); + // We only pad in text section. + if (!OS.getCurrentSectionOnly()->getKind().isText()) + return false; + // To be Done: Currently don't deal with Bundle cases. - if (Assembler.isBundlingEnabled() && Sec->isBundleLocked()) + if (OS.getAssembler().isBundlingEnabled()) return false; // Branches only need to be aligned in 32-bit or 64-bit mode. @@ -377,59 +581,42 @@ bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const { return true; } -/// Check if the instruction operand needs to be aligned. Padding is disabled -/// before intruction which may be rewritten by linker(e.g. TLSCALL). -bool X86AsmBackend::needAlignInst(const MCInst &Inst) const { - // Linker may rewrite the instruction with variant symbol operand. - if (hasVariantSymbol(Inst)) - return false; - - const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode()); - return (InstDesc.isConditionalBranch() && +/// Check if the instruction operand needs to be aligned. 
+bool X86AsmBackend::needAlign(const MCInst &Inst) const { + const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); + return (Desc.isConditionalBranch() && (AlignBranchType & X86::AlignBranchJcc)) || - (InstDesc.isUnconditionalBranch() && + (Desc.isUnconditionalBranch() && (AlignBranchType & X86::AlignBranchJmp)) || - (InstDesc.isCall() && - (AlignBranchType & X86::AlignBranchCall)) || - (InstDesc.isReturn() && - (AlignBranchType & X86::AlignBranchRet)) || - (InstDesc.isIndirectBranch() && + (Desc.isCall() && (AlignBranchType & X86::AlignBranchCall)) || + (Desc.isReturn() && (AlignBranchType & X86::AlignBranchRet)) || + (Desc.isIndirectBranch() && (AlignBranchType & X86::AlignBranchIndirect)); } -static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) { - // If a MCBoundaryAlignFragment has not been used to emit NOP,we can reuse it. - return !F.canEmitNops(); -} +/// Insert BoundaryAlignFragment before instructions to align branches. +void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, + const MCInst &Inst) { + CanPadInst = canPadInst(Inst, OS); -MCBoundaryAlignFragment * -X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const { - auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment()); - if (!F || !canReuseBoundaryAlignFragment(*F)) { - F = new MCBoundaryAlignFragment(AlignBoundary); - OS.insert(F); - } - return F; -} + if (!canPadBranches(OS)) + return; + + if (!isMacroFused(PrevInst, Inst)) + // Macro fusion doesn't happen indeed, clear the pending. + PendingBA = nullptr; -/// Insert MCBoundaryAlignFragment before instructions to align branches. -void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS, - const MCInst &Inst) { - if (!needAlign(OS)) + if (!CanPadInst) return; - MCFragment *CF = OS.getCurrentFragment(); - bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused; - if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) { + if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) { // Macro fusion actually happens and there is no other fragment inserted - // after the previous instruction. NOP can be emitted in PF to align fused - // jcc. - if (auto *PF = - dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) { - const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true); - const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true); - } - } else if (needAlignInst(Inst)) { + // after the previous instruction. + // + // Do nothing here since we already inserted a BoudaryAlign fragment when + // we met the first instruction in the fused pair and we'll tie them + // together in emitInstructionEnd. + // // Note: When there is at least one fragment, such as MCAlignFragment, // inserted after the previous instruction, e.g. // @@ -441,34 +628,41 @@ void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS, // // We will treat the JCC as a unfused branch although it may be fused // with the CMP. - auto *F = getOrCreateBoundaryAlignFragment(OS); - F->setEmitNops(true); - F->setFused(false); - } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) { - // We don't know if macro fusion happens until the reaching the next - // instruction, so a place holder is put here if necessary. 
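The needAlign() predicate above is a pure bitmask test against the branch kinds the user asked to keep from crossing the alignment boundary. A sketch of that test, assuming the flag names from X86::AlignBranchKind; the Ret and Indirect values appear later in this patch, the other bit positions are assumed:

// Branch-kind bits; one bit per kind of branch that may be aligned.
enum AlignBranchKind : unsigned {
  AlignBranchNone = 0,
  AlignBranchFused = 1u << 0,
  AlignBranchJcc = 1u << 1,
  AlignBranchJmp = 1u << 2,
  AlignBranchCall = 1u << 3,
  AlignBranchRet = 1u << 4,
  AlignBranchIndirect = 1u << 5
};

// Does this instruction's class match any branch kind enabled in the mask?
bool needsAlign(unsigned EnabledMask, bool IsJcc, bool IsJmp, bool IsCall,
                bool IsRet, bool IsIndirect) {
  return (IsJcc && (EnabledMask & AlignBranchJcc)) ||
         (IsJmp && (EnabledMask & AlignBranchJmp)) ||
         (IsCall && (EnabledMask & AlignBranchCall)) ||
         (IsRet && (EnabledMask & AlignBranchRet)) ||
         (IsIndirect && (EnabledMask & AlignBranchIndirect));
}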
- getOrCreateBoundaryAlignFragment(OS); + return; } - PrevInst = Inst; + if (needAlign(Inst) || ((AlignBranchType & X86::AlignBranchFused) && + isFirstMacroFusibleInst(Inst, *MCII))) { + // If we meet a unfused branch or the first instuction in a fusiable pair, + // insert a BoundaryAlign fragment. + OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary)); + } } -/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned -/// if necessary. -void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) { - if (!needAlign(OS)) +/// Set the last fragment to be aligned for the BoundaryAlignFragment. +void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { + PrevInst = Inst; + MCFragment *CF = OS.getCurrentFragment(); + PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF)); + if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF)) + F->setAllowAutoPadding(CanPadInst); + + if (!canPadBranches(OS)) + return; + + if (!needAlign(Inst) || !PendingBA) return; - // If the branch is emitted into a MCRelaxableFragment, we can determine the - // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the - // branch is fused, the fused branch(macro fusion pair) must be emitted into - // two fragments. Or when the branch is unfused, the branch must be emitted - // into one fragment. The MCRelaxableFragment naturally marks the end of the - // fused or unfused branch. - // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of - // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align - // other branch. - if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment())) - OS.insert(new MCBoundaryAlignFragment(AlignBoundary)); + + // Tie the aligned instructions into a a pending BoundaryAlign. + PendingBA->setLastFragment(CF); + PendingBA = nullptr; + + // We need to ensure that further data isn't added to the current + // DataFragment, so that we can get the size of instructions later in + // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty + // DataFragment. + if (isa_and_nonnull<MCDataFragment>(CF)) + OS.insert(new MCDataFragment()); // Update the maximum alignment on the current section if necessary. MCSection *Sec = OS.getCurrentSectionOnly(); @@ -478,13 +672,23 @@ void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) { Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { if (STI.getTargetTriple().isOSBinFormatELF()) { + unsigned Type; if (STI.getTargetTriple().getArch() == Triple::x86_64) { - if (Name == "R_X86_64_NONE") - return FK_NONE; + Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/x86_64.def" +#undef ELF_RELOC + .Default(-1u); } else { - if (Name == "R_386_NONE") - return FK_NONE; + Type = llvm::StringSwitch<unsigned>(Name) +#define ELF_RELOC(X, Y) .Case(#X, Y) +#include "llvm/BinaryFormat/ELFRelocs/i386.def" +#undef ELF_RELOC + .Default(-1u); } + if (Type == -1u) + return None; + return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type); } return MCAsmBackend::getFixupKind(Name); } @@ -502,6 +706,11 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, }; + // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They + // do not require any extra processing. 
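The reworked getFixupKind() above maps a ".reloc" directive name directly onto the raw ELF relocation type and tags it as a "literal" fixup so it is emitted unchanged. A simplified sketch of the same mapping, with a hand-written table standing in for the ELFRelocs/x86_64.def include and a placeholder value for FirstLiteralRelocationKind:

#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>

// Placeholder; in LLVM this is MCFixupKind::FirstLiteralRelocationKind. Only
// the property that literal kinds sit above all target fixups matters here.
constexpr unsigned FirstLiteralRelocationKind = 256;

// Map a ".reloc R_X86_64_*" name to a literal fixup kind carrying the raw
// ELF relocation number. The real table is generated from x86_64.def.
std::optional<unsigned> literalFixupKind(const std::string &Name) {
  static const std::unordered_map<std::string, unsigned> Relocs = {
      {"R_X86_64_NONE", 0}, {"R_X86_64_64", 1}, {"R_X86_64_PC32", 2}};
  auto It = Relocs.find(Name);
  if (It == Relocs.end())
    return std::nullopt;
  return FirstLiteralRelocationKind + It->second;
}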
+ if (Kind >= FirstLiteralRelocationKind) + return MCAsmBackend::getFixupKindInfo(FK_NONE); + if (Kind < FirstTargetFixupKind) return MCAsmBackend::getFixupKindInfo(Kind); @@ -514,7 +723,7 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { bool X86AsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &Fixup, const MCValue &) { - return Fixup.getKind() == FK_NONE; + return Fixup.getKind() >= FirstLiteralRelocationKind; } static unsigned getFixupKindSize(unsigned Kind) { @@ -556,7 +765,10 @@ void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { - unsigned Size = getFixupKindSize(Fixup.getKind()); + unsigned Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return; + unsigned Size = getFixupKindSize(Kind); assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); @@ -613,12 +825,11 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, // FIXME: Can tblgen help at all here to verify there aren't other instructions // we can relax? -void X86AsmBackend::relaxInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI, - MCInst &Res) const { +void X86AsmBackend::relaxInstruction(MCInst &Inst, + const MCSubtargetInfo &STI) const { // The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel. - bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; - unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode); + bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode); if (RelaxedOp == Inst.getOpcode()) { SmallString<256> Tmp; @@ -628,8 +839,232 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, report_fatal_error("unexpected instruction to relax: " + OS.str()); } - Res = Inst; - Res.setOpcode(RelaxedOp); + Inst.setOpcode(RelaxedOp); +} + +/// Return true if this instruction has been fully relaxed into it's most +/// general available form. +static bool isFullyRelaxed(const MCRelaxableFragment &RF) { + auto &Inst = RF.getInst(); + auto &STI = *RF.getSubtargetInfo(); + bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit]; + return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode(); +} + +bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const { + if (!RF.getAllowAutoPadding()) + return false; + // If the instruction isn't fully relaxed, shifting it around might require a + // larger value for one of the fixups then can be encoded. The outer loop + // will also catch this before moving to the next instruction, but we need to + // prevent padding this single instruction as well. + if (!isFullyRelaxed(RF)) + return false; + + const unsigned OldSize = RF.getContents().size(); + if (OldSize == 15) + return false; + + const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize); + const unsigned RemainingPrefixSize = [&]() -> unsigned { + SmallString<15> Code; + raw_svector_ostream VecOS(Code); + Emitter.emitPrefix(RF.getInst(), VecOS, STI); + assert(Code.size() < 15 && "The number of prefixes must be less than 15."); + + // TODO: It turns out we need a decent amount of plumbing for the target + // specific bits to determine number of prefixes its safe to add. Various + // targets (older chips mostly, but also Atom family) encounter decoder + // stalls with too many prefixes. 
For testing purposes, we set the value + // externally for the moment. + unsigned ExistingPrefixSize = Code.size(); + if (TargetPrefixMax <= ExistingPrefixSize) + return 0; + return TargetPrefixMax - ExistingPrefixSize; + }(); + const unsigned PrefixBytesToAdd = + std::min(MaxPossiblePad, RemainingPrefixSize); + if (PrefixBytesToAdd == 0) + return false; + + const uint8_t Prefix = determinePaddingPrefix(RF.getInst()); + + SmallString<256> Code; + Code.append(PrefixBytesToAdd, Prefix); + Code.append(RF.getContents().begin(), RF.getContents().end()); + RF.getContents() = Code; + + // Adjust the fixups for the change in offsets + for (auto &F : RF.getFixups()) { + F.setOffset(F.getOffset() + PrefixBytesToAdd); + } + + RemainingSize -= PrefixBytesToAdd; + return true; +} + +bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const { + if (isFullyRelaxed(RF)) + // TODO: There are lots of other tricks we could apply for increasing + // encoding size without impacting performance. + return false; + + MCInst Relaxed = RF.getInst(); + relaxInstruction(Relaxed, *RF.getSubtargetInfo()); + + SmallVector<MCFixup, 4> Fixups; + SmallString<15> Code; + raw_svector_ostream VecOS(Code); + Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo()); + const unsigned OldSize = RF.getContents().size(); + const unsigned NewSize = Code.size(); + assert(NewSize >= OldSize && "size decrease during relaxation?"); + unsigned Delta = NewSize - OldSize; + if (Delta > RemainingSize) + return false; + RF.setInst(Relaxed); + RF.getContents() = Code; + RF.getFixups() = Fixups; + RemainingSize -= Delta; + return true; +} + +bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF, + MCCodeEmitter &Emitter, + unsigned &RemainingSize) const { + bool Changed = false; + if (RemainingSize != 0) + Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize); + if (RemainingSize != 0) + Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize); + return Changed; +} + +void X86AsmBackend::finishLayout(MCAssembler const &Asm, + MCAsmLayout &Layout) const { + // See if we can further relax some instructions to cut down on the number of + // nop bytes required for code alignment. The actual win is in reducing + // instruction count, not number of bytes. Modern X86-64 can easily end up + // decode limited. It is often better to reduce the number of instructions + // (i.e. eliminate nops) even at the cost of increasing the size and + // complexity of others. 
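The amount of prefix padding computed in padInstructionViaPrefix above is bounded three ways: the x86 hard limit of 15 bytes per instruction, the padding still owed to the alignment fragment, and a per-target cap on total prefixes (older decoders stall on long prefix runs). A minimal sketch of that arithmetic, with illustrative names:

#include <algorithm>

unsigned prefixBytesToAdd(unsigned EncodedSize, unsigned ExistingPrefixes,
                          unsigned RemainingPad, unsigned TargetPrefixMax) {
  if (EncodedSize >= 15)
    return 0;                                  // already at the length limit
  unsigned RoomInInst = 15 - EncodedSize;      // stay within 15 bytes
  unsigned RoomInBudget = TargetPrefixMax > ExistingPrefixes
                              ? TargetPrefixMax - ExistingPrefixes
                              : 0;             // respect the prefix cap
  return std::min({RoomInInst, RoomInBudget, RemainingPad});
}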
+ if (!X86PadForAlign && !X86PadForBranchAlign) + return; + + DenseSet<MCFragment *> LabeledFragments; + for (const MCSymbol &S : Asm.symbols()) + LabeledFragments.insert(S.getFragment(false)); + + for (MCSection &Sec : Asm) { + if (!Sec.getKind().isText()) + continue; + + SmallVector<MCRelaxableFragment *, 4> Relaxable; + for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { + MCFragment &F = *I; + + if (LabeledFragments.count(&F)) + Relaxable.clear(); + + if (F.getKind() == MCFragment::FT_Data || + F.getKind() == MCFragment::FT_CompactEncodedInst) + // Skip and ignore + continue; + + if (F.getKind() == MCFragment::FT_Relaxable) { + auto &RF = cast<MCRelaxableFragment>(*I); + Relaxable.push_back(&RF); + continue; + } + + auto canHandle = [](MCFragment &F) -> bool { + switch (F.getKind()) { + default: + return false; + case MCFragment::FT_Align: + return X86PadForAlign; + case MCFragment::FT_BoundaryAlign: + return X86PadForBranchAlign; + } + }; + // For any unhandled kind, assume we can't change layout. + if (!canHandle(F)) { + Relaxable.clear(); + continue; + } + +#ifndef NDEBUG + const uint64_t OrigOffset = Layout.getFragmentOffset(&F); +#endif + const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F); + + // To keep the effects local, prefer to relax instructions closest to + // the align directive. This is purely about human understandability + // of the resulting code. If we later find a reason to expand + // particular instructions over others, we can adjust. + MCFragment *FirstChangedFragment = nullptr; + unsigned RemainingSize = OrigSize; + while (!Relaxable.empty() && RemainingSize != 0) { + auto &RF = *Relaxable.pop_back_val(); + // Give the backend a chance to play any tricks it wishes to increase + // the encoding size of the given instruction. Target independent code + // will try further relaxation, but target's may play further tricks. + if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize)) + FirstChangedFragment = &RF; + + // If we have an instruction which hasn't been fully relaxed, we can't + // skip past it and insert bytes before it. Changing its starting + // offset might require a larger negative offset than it can encode. + // We don't need to worry about larger positive offsets as none of the + // possible offsets between this and our align are visible, and the + // ones afterwards aren't changing. + if (!isFullyRelaxed(RF)) + break; + } + Relaxable.clear(); + + if (FirstChangedFragment) { + // Make sure the offsets for any fragments in the effected range get + // updated. Note that this (conservatively) invalidates the offsets of + // those following, but this is not required. + Layout.invalidateFragmentsFrom(FirstChangedFragment); + } + + // BoundaryAlign explicitly tracks it's size (unlike align) + if (F.getKind() == MCFragment::FT_BoundaryAlign) + cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize); + +#ifndef NDEBUG + const uint64_t FinalOffset = Layout.getFragmentOffset(&F); + const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F); + assert(OrigOffset + OrigSize == FinalOffset + FinalSize && + "can't move start of next fragment!"); + assert(FinalSize == RemainingSize && "inconsistent size computation?"); +#endif + + // If we're looking at a boundary align, make sure we don't try to pad + // its target instructions for some following directive. Doing so would + // break the alignment of the current boundary align. 
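The inner loop of finishLayout above is a greedy pass: grow the relaxable instructions nearest the alignment point first, and stop early at an instruction that is not fully relaxed, since bytes may not be inserted before it safely. A compact sketch of that shape, under simplified data types:

#include <algorithm>
#include <vector>

struct Relaxee { unsigned MaxGrowth; bool FullyRelaxed; };

// Returns the padding left for the BoundaryAlign fragment to emit as NOPs.
unsigned padTowardsAlign(std::vector<Relaxee> &Relaxable, unsigned Remaining) {
  while (!Relaxable.empty() && Remaining != 0) {
    Relaxee R = Relaxable.back();
    Relaxable.pop_back();
    unsigned Grow = std::min(R.MaxGrowth, Remaining);
    Remaining -= Grow;          // these bytes no longer need a NOP
    if (!R.FullyRelaxed)
      break;                    // cannot pad across this instruction
  }
  Relaxable.clear();
  return Remaining;
}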
+ if (auto *BF = dyn_cast<MCBoundaryAlignFragment>(&F)) { + const MCFragment *LastFragment = BF->getLastFragment(); + if (!LastFragment) + continue; + while (&*I != LastFragment) + ++I; + } + } + } + + // The layout is done. Mark every fragment as valid. + for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) { + MCSection &Section = *Layout.getSectionOrder()[i]; + Layout.getFragmentOffset(&*Section.getFragmentList().rbegin()); + Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin()); + } } /// Write a sequence of optimal nops to the output, covering \p Count @@ -661,7 +1096,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { // This CPU doesn't support long nops. If needed add more. // FIXME: We could generated something better than plain 0x90. - if (!STI.getFeatureBits()[X86::FeatureNOPL]) { + if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) { for (uint64_t i = 0; i < Count; ++i) OS << '\x90'; return true; @@ -670,7 +1105,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const { // 15-bytes is the longest single NOP instruction, but 10-bytes is // commonly the longest that can be efficiently decoded. uint64_t MaxNopLength = 10; - if (STI.getFeatureBits()[X86::ProcIntelSLM]) + if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP]) MaxNopLength = 7; else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) MaxNopLength = 15; @@ -811,6 +1246,7 @@ class DarwinX86AsmBackend : public X86AsmBackend { enum { CU_NUM_SAVED_REGS = 6 }; mutable unsigned SavedRegs[CU_NUM_SAVED_REGS]; + Triple TT; bool Is64Bit; unsigned OffsetSize; ///< Offset of a "push" instruction. @@ -838,10 +1274,140 @@ protected: return 1; } +private: + /// Get the compact unwind number for a given register. The number + /// corresponds to the enum lists in compact_unwind_encoding.h. + int getCompactUnwindRegNum(unsigned Reg) const { + static const MCPhysReg CU32BitRegs[7] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const MCPhysReg CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + for (int Idx = 1; *CURegs; ++CURegs, ++Idx) + if (*CURegs == Reg) + return Idx; + + return -1; + } + + /// Return the registers encoded for a compact encoding with a frame + /// pointer. + uint32_t encodeCompactUnwindRegistersWithFrame() const { + // Encode the registers in the order they were saved --- 3-bits per + // register. The list of saved registers is assumed to be in reverse + // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. + uint32_t RegEnc = 0; + for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { + unsigned Reg = SavedRegs[i]; + if (Reg == 0) break; + + int CURegNum = getCompactUnwindRegNum(Reg); + if (CURegNum == -1) return ~0U; + + // Encode the 3-bit register number in order, skipping over 3-bits for + // each register. + RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); + } + + assert((RegEnc & 0x3FFFF) == RegEnc && + "Invalid compact register encoding!"); + return RegEnc; + } + + /// Create the permutation encoding used with frameless stacks. It is + /// passed the number of registers to be saved and an array of the registers + /// saved. + uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { + // The saved registers are numbered from 1 to 6. 
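The frame-pointer variant of the compact-unwind encoding above packs up to six saved registers, three bits each, lowest field first. A standalone sketch of that packing; it assumes the registers are already translated to their 1..6 compact-unwind numbers (0 terminates the list), which getCompactUnwindRegNum handles in the real code:

#include <cstdint>

uint32_t packSavedRegs(const unsigned SavedRegs[6]) {
  uint32_t Enc = 0;
  for (int I = 0, Slot = 0; I != 6; ++I) {
    if (SavedRegs[I] == 0)
      break;                                  // end of the saved-register list
    Enc |= (SavedRegs[I] & 0x7) << (Slot++ * 3);
  }
  return Enc;                                 // at most 18 bits (0x3FFFF)
}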
In order to encode the + // order in which they were saved, we re-number them according to their + // place in the register order. The re-numbering is relative to the last + // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in + // that order: + // + // Orig Re-Num + // ---- ------ + // 6 6 + // 2 2 + // 4 3 + // 5 3 + // + for (unsigned i = 0; i < RegCount; ++i) { + int CUReg = getCompactUnwindRegNum(SavedRegs[i]); + if (CUReg == -1) return ~0U; + SavedRegs[i] = CUReg; + } + + // Reverse the list. + std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); + + uint32_t RenumRegs[CU_NUM_SAVED_REGS]; + for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ + unsigned Countless = 0; + for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) + if (SavedRegs[j] < SavedRegs[i]) + ++Countless; + + RenumRegs[i] = SavedRegs[i] - Countless - 1; + } + + // Take the renumbered values and encode them into a 10-bit number. + uint32_t permutationEncoding = 0; + switch (RegCount) { + case 6: + permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] + + 6 * RenumRegs[2] + 2 * RenumRegs[3] + + RenumRegs[4]; + break; + case 5: + permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] + + 6 * RenumRegs[3] + 2 * RenumRegs[4] + + RenumRegs[5]; + break; + case 4: + permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] + + 3 * RenumRegs[4] + RenumRegs[5]; + break; + case 3: + permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] + + RenumRegs[5]; + break; + case 2: + permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; + break; + case 1: + permutationEncoding |= RenumRegs[5]; + break; + } + + assert((permutationEncoding & 0x3FF) == permutationEncoding && + "Invalid compact register encoding!"); + return permutationEncoding; + } + +public: + DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : X86AsmBackend(T, STI), MRI(MRI), TT(STI.getTargetTriple()), + Is64Bit(TT.isArch64Bit()) { + memset(SavedRegs, 0, sizeof(SavedRegs)); + OffsetSize = Is64Bit ? 8 : 4; + MoveInstrSize = Is64Bit ? 3 : 2; + StackDivide = Is64Bit ? 8 : 4; + } + + std::unique_ptr<MCObjectTargetWriter> + createObjectTargetWriter() const override { + uint32_t CPUType = cantFail(MachO::getCPUType(TT)); + uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TT)); + return createX86MachObjectWriter(Is64Bit, CPUType, CPUSubType); + } + /// Implementation of algorithm to generate the compact unwind encoding /// for the CFI instructions. uint32_t - generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const { + generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const override { if (Instrs.empty()) return 0; // Reset the saved registers. @@ -904,7 +1470,7 @@ protected: // L0: // .cfi_def_cfa_offset 80 // - StackSize = std::abs(Inst.getOffset()) / StackDivide; + StackSize = Inst.getOffset() / StackDivide; ++NumDefCFAOffsets; break; } @@ -991,168 +1557,6 @@ protected: return CompactUnwindEncoding; } - -private: - /// Get the compact unwind number for a given register. The number - /// corresponds to the enum lists in compact_unwind_encoding.h. - int getCompactUnwindRegNum(unsigned Reg) const { - static const MCPhysReg CU32BitRegs[7] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const MCPhysReg CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const MCPhysReg *CURegs = Is64Bit ? 
CU64BitRegs : CU32BitRegs; - for (int Idx = 1; *CURegs; ++CURegs, ++Idx) - if (*CURegs == Reg) - return Idx; - - return -1; - } - - /// Return the registers encoded for a compact encoding with a frame - /// pointer. - uint32_t encodeCompactUnwindRegistersWithFrame() const { - // Encode the registers in the order they were saved --- 3-bits per - // register. The list of saved registers is assumed to be in reverse - // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS. - uint32_t RegEnc = 0; - for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) { - unsigned Reg = SavedRegs[i]; - if (Reg == 0) break; - - int CURegNum = getCompactUnwindRegNum(Reg); - if (CURegNum == -1) return ~0U; - - // Encode the 3-bit register number in order, skipping over 3-bits for - // each register. - RegEnc |= (CURegNum & 0x7) << (Idx++ * 3); - } - - assert((RegEnc & 0x3FFFF) == RegEnc && - "Invalid compact register encoding!"); - return RegEnc; - } - - /// Create the permutation encoding used with frameless stacks. It is - /// passed the number of registers to be saved and an array of the registers - /// saved. - uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const { - // The saved registers are numbered from 1 to 6. In order to encode the - // order in which they were saved, we re-number them according to their - // place in the register order. The re-numbering is relative to the last - // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in - // that order: - // - // Orig Re-Num - // ---- ------ - // 6 6 - // 2 2 - // 4 3 - // 5 3 - // - for (unsigned i = 0; i < RegCount; ++i) { - int CUReg = getCompactUnwindRegNum(SavedRegs[i]); - if (CUReg == -1) return ~0U; - SavedRegs[i] = CUReg; - } - - // Reverse the list. - std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]); - - uint32_t RenumRegs[CU_NUM_SAVED_REGS]; - for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){ - unsigned Countless = 0; - for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j) - if (SavedRegs[j] < SavedRegs[i]) - ++Countless; - - RenumRegs[i] = SavedRegs[i] - Countless - 1; - } - - // Take the renumbered values and encode them into a 10-bit number. - uint32_t permutationEncoding = 0; - switch (RegCount) { - case 6: - permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1] - + 6 * RenumRegs[2] + 2 * RenumRegs[3] - + RenumRegs[4]; - break; - case 5: - permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2] - + 6 * RenumRegs[3] + 2 * RenumRegs[4] - + RenumRegs[5]; - break; - case 4: - permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3] - + 3 * RenumRegs[4] + RenumRegs[5]; - break; - case 3: - permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4] - + RenumRegs[5]; - break; - case 2: - permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5]; - break; - case 1: - permutationEncoding |= RenumRegs[5]; - break; - } - - assert((permutationEncoding & 0x3FF) == permutationEncoding && - "Invalid compact register encoding!"); - return permutationEncoding; - } - -public: - DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, bool Is64Bit) - : X86AsmBackend(T, STI), MRI(MRI), Is64Bit(Is64Bit) { - memset(SavedRegs, 0, sizeof(SavedRegs)); - OffsetSize = Is64Bit ? 8 : 4; - MoveInstrSize = Is64Bit ? 3 : 2; - StackDivide = Is64Bit ? 
8 : 4; - } -}; - -class DarwinX86_32AsmBackend : public DarwinX86AsmBackend { -public: - DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : DarwinX86AsmBackend(T, MRI, STI, false) {} - - std::unique_ptr<MCObjectTargetWriter> - createObjectTargetWriter() const override { - return createX86MachObjectWriter(/*Is64Bit=*/false, - MachO::CPU_TYPE_I386, - MachO::CPU_SUBTYPE_I386_ALL); - } - - /// Generate the compact unwind encoding for the CFI instructions. - uint32_t generateCompactUnwindEncoding( - ArrayRef<MCCFIInstruction> Instrs) const override { - return generateCompactUnwindEncodingImpl(Instrs); - } -}; - -class DarwinX86_64AsmBackend : public DarwinX86AsmBackend { - const MachO::CPUSubTypeX86 Subtype; -public: - DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, MachO::CPUSubTypeX86 st) - : DarwinX86AsmBackend(T, MRI, STI, true), Subtype(st) {} - - std::unique_ptr<MCObjectTargetWriter> - createObjectTargetWriter() const override { - return createX86MachObjectWriter(/*Is64Bit=*/true, MachO::CPU_TYPE_X86_64, - Subtype); - } - - /// Generate the compact unwind encoding for the CFI instructions. - uint32_t generateCompactUnwindEncoding( - ArrayRef<MCCFIInstruction> Instrs) const override { - return generateCompactUnwindEncodingImpl(Instrs); - } }; } // end anonymous namespace @@ -1163,7 +1567,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); if (TheTriple.isOSBinFormatMachO()) - return new DarwinX86_32AsmBackend(T, MRI, STI); + return new DarwinX86AsmBackend(T, MRI, STI); if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF()) return new WindowsX86AsmBackend(T, false, STI); @@ -1181,13 +1585,8 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { const Triple &TheTriple = STI.getTargetTriple(); - if (TheTriple.isOSBinFormatMachO()) { - MachO::CPUSubTypeX86 CS = - StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName()) - .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H) - .Default(MachO::CPU_SUBTYPE_X86_64_ALL); - return new DarwinX86_64AsmBackend(T, MRI, STI, CS); - } + if (TheTriple.isOSBinFormatMachO()) + return new DarwinX86AsmBackend(T, MRI, STI); if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF()) return new WindowsX86AsmBackend(T, true, STI); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index a4f8dd669e1e..79f07d3c7792 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -91,7 +91,7 @@ namespace X86 { COND_G = 15, LAST_VALID_COND = COND_G, - // Artificial condition codes. These are used by AnalyzeBranch + // Artificial condition codes. These are used by analyzeBranch // to indicate a block terminated with two conditional branches that together // form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE, // which can't be represented on x86 with a single condition. These @@ -356,6 +356,39 @@ namespace X86 { AlignBranchRet = 1U << 4, AlignBranchIndirect = 1U << 5 }; + + /// Defines the encoding values for segment override prefix. 
+ enum EncodingOfSegmentOverridePrefix : uint8_t { + CS_Encoding = 0x2E, + DS_Encoding = 0x3E, + ES_Encoding = 0x26, + FS_Encoding = 0x64, + GS_Encoding = 0x65, + SS_Encoding = 0x36 + }; + + /// Given a segment register, return the encoding of the segment override + /// prefix for it. + inline EncodingOfSegmentOverridePrefix + getSegmentOverridePrefixForReg(unsigned Reg) { + switch (Reg) { + default: + llvm_unreachable("Unknown segment register!"); + case X86::CS: + return CS_Encoding; + case X86::DS: + return DS_Encoding; + case X86::ES: + return ES_Encoding; + case X86::FS: + return FS_Encoding; + case X86::GS: + return GS_Encoding; + case X86::SS: + return SS_Encoding; + } + } + } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -581,90 +614,107 @@ namespace X86II { /// in the lower 4 bits of the opcode. AddCCFrm = 9, + /// PrefixByte - This form is used for instructions that represent a prefix + /// byte like data16 or rep. + PrefixByte = 10, + /// MRM[0-7][rm] - These forms are used to represent instructions that use /// a Mod/RM byte, and use the middle field to hold extended opcode /// information. In the intel manual these are represented as /0, /1, ... /// + // Instructions operate on a register Reg/Opcode operand not the r/m field. + MRMr0 = 21, + + /// MRMSrcMem - But force to use the SIB field. + MRMSrcMemFSIB = 22, + + /// MRMDestMem - But force to use the SIB field. + MRMDestMemFSIB = 23, + /// MRMDestMem - This form is used for instructions that use the Mod/RM byte /// to specify a destination, which in this case is memory. /// - MRMDestMem = 32, + MRMDestMem = 24, /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte /// to specify a source, which in this case is memory. /// - MRMSrcMem = 33, + MRMSrcMem = 25, /// MRMSrcMem4VOp3 - This form is used for instructions that encode /// operand 3 with VEX.VVVV and load from memory. /// - MRMSrcMem4VOp3 = 34, + MRMSrcMem4VOp3 = 26, /// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM /// byte to specify the fourth source, which in this case is memory. /// - MRMSrcMemOp4 = 35, + MRMSrcMemOp4 = 27, /// MRMSrcMemCC - This form is used for instructions that use the Mod/RM /// byte to specify the operands and also encodes a condition code. /// - MRMSrcMemCC = 36, + MRMSrcMemCC = 28, /// MRMXm - This form is used for instructions that use the Mod/RM byte /// to specify a memory source, but doesn't use the middle field. And has /// a condition code. /// - MRMXmCC = 38, + MRMXmCC = 30, /// MRMXm - This form is used for instructions that use the Mod/RM byte /// to specify a memory source, but doesn't use the middle field. /// - MRMXm = 39, + MRMXm = 31, // Next, instructions that operate on a memory r/m operand... - MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3 - MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7 + MRM0m = 32, MRM1m = 33, MRM2m = 34, MRM3m = 35, // Format /0 /1 /2 /3 + MRM4m = 36, MRM5m = 37, MRM6m = 38, MRM7m = 39, // Format /4 /5 /6 /7 /// MRMDestReg - This form is used for instructions that use the Mod/RM byte /// to specify a destination, which in this case is a register. /// - MRMDestReg = 48, + MRMDestReg = 40, /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte /// to specify a source, which in this case is a register. 
/// - MRMSrcReg = 49, + MRMSrcReg = 41, /// MRMSrcReg4VOp3 - This form is used for instructions that encode /// operand 3 with VEX.VVVV and do not load from memory. /// - MRMSrcReg4VOp3 = 50, + MRMSrcReg4VOp3 = 42, /// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM /// byte to specify the fourth source, which in this case is a register. /// - MRMSrcRegOp4 = 51, + MRMSrcRegOp4 = 43, /// MRMSrcRegCC - This form is used for instructions that use the Mod/RM /// byte to specify the operands and also encodes a condition code /// - MRMSrcRegCC = 52, + MRMSrcRegCC = 44, /// MRMXCCr - This form is used for instructions that use the Mod/RM byte /// to specify a register source, but doesn't use the middle field. And has /// a condition code. /// - MRMXrCC = 54, + MRMXrCC = 46, /// MRMXr - This form is used for instructions that use the Mod/RM byte /// to specify a register source, but doesn't use the middle field. /// - MRMXr = 55, + MRMXr = 47, // Instructions that operate on a register r/m operand... - MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3 - MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7 + MRM0r = 48, MRM1r = 49, MRM2r = 50, MRM3r = 51, // Format /0 /1 /2 /3 + MRM4r = 52, MRM5r = 53, MRM6r = 54, MRM7r = 55, // Format /4 /5 /6 /7 + + // Instructions that operate that have mod=11 and an opcode but ignore r/m. + MRM0X = 56, MRM1X = 57, MRM2X = 58, MRM3X = 59, // Format /0 /1 /2 /3 + MRM4X = 60, MRM5X = 61, MRM6X = 62, MRM7X = 63, // Format /4 /5 /6 /7 /// MRM_XX - A mod/rm byte of exactly 0xXX. MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67, @@ -900,6 +950,16 @@ namespace X86II { NOTRACK = 1ULL << NoTrackShift }; + /// \returns true if the instruction with given opcode is a prefix. + inline bool isPrefix(uint64_t TSFlags) { + return (TSFlags & X86II::FormMask) == PrefixByte; + } + + /// \returns true if the instruction with given opcode is a pseudo. + inline bool isPseudo(uint64_t TSFlags) { + return (TSFlags & X86II::FormMask) == Pseudo; + } + /// \returns the "base" X86 opcode for the specified machine /// instruction. inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) { @@ -1028,10 +1088,13 @@ namespace X86II { case X86II::RawFrmDst: case X86II::RawFrmDstSrc: case X86II::AddCCFrm: + case X86II::PrefixByte: return -1; case X86II::MRMDestMem: + case X86II::MRMDestMemFSIB: return 0; case X86II::MRMSrcMem: + case X86II::MRMSrcMemFSIB: // Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a // mask register. 
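The new isPrefix/isPseudo helpers above are simple tests of the instruction "form" stored in the low bits of TSFlags. A sketch of that check; PrefixByte = 10 is visible in this hunk, while the FormMask and Pseudo constants are quoted from memory and should be treated as illustrative:

#include <cstdint>

constexpr uint64_t FormMask = 127;
constexpr uint64_t Pseudo = 0;
constexpr uint64_t PrefixByte = 10;

// An instruction is a prefix (or pseudo) iff its form field says so.
bool isPrefixForm(uint64_t TSFlags) { return (TSFlags & FormMask) == PrefixByte; }
bool isPseudoForm(uint64_t TSFlags) { return (TSFlags & FormMask) == Pseudo; }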
return 1 + HasVEX_4V + HasEVEX_K; @@ -1051,12 +1114,18 @@ namespace X86II { case X86II::MRMSrcRegOp4: case X86II::MRMSrcRegCC: case X86II::MRMXrCC: + case X86II::MRMr0: case X86II::MRMXr: case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: case X86II::MRM3r: case X86II::MRM4r: case X86II::MRM5r: case X86II::MRM6r: case X86II::MRM7r: return -1; + case X86II::MRM0X: case X86II::MRM1X: + case X86II::MRM2X: case X86II::MRM3X: + case X86II::MRM4X: case X86II::MRM5X: + case X86II::MRM6X: case X86II::MRM7X: + return -1; case X86II::MRMXmCC: case X86II::MRMXm: case X86II::MRM0m: case X86II::MRM1m: diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index bd009da60851..292dd17e2f51 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -317,8 +317,10 @@ static unsigned getRelocType32(MCContext &Ctx, unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); MCFixupKind Kind = Fixup.getKind(); + if (Kind >= FirstLiteralRelocationKind) + return Kind - FirstLiteralRelocationKind; + MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); X86_64RelType Type = getType64(Kind, Modifier, IsPCRel); if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 73b1969b4e82..b51011e2c52f 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -15,7 +15,7 @@ #include "X86ATTInstPrinter.h" #include "X86BaseInfo.h" #include "X86MCTargetDesc.h" -#include "Utils/X86ShuffleDecode.h" +#include "X86ShuffleDecode.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/Support/raw_ostream.h" @@ -199,6 +199,40 @@ using namespace llvm; CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \ CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int) +#define CASE_FMA4(Inst, suf) \ + CASE_AVX_INS_COMMON(Inst, 4, suf) \ + CASE_AVX_INS_COMMON(Inst, 4Y, suf) + +#define CASE_FMA4_PACKED_RR(Inst) \ + CASE_FMA4(Inst##PD, rr) \ + CASE_FMA4(Inst##PS, rr) + +#define CASE_FMA4_PACKED_RM(Inst) \ + CASE_FMA4(Inst##PD, rm) \ + CASE_FMA4(Inst##PS, rm) + +#define CASE_FMA4_PACKED_MR(Inst) \ + CASE_FMA4(Inst##PD, mr) \ + CASE_FMA4(Inst##PS, mr) + +#define CASE_FMA4_SCALAR_RR(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rr) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rr) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rr_Int) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rr_Int) + +#define CASE_FMA4_SCALAR_RM(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rm) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rm) \ + CASE_AVX_INS_COMMON(Inst##SD4, , rm_Int) \ + CASE_AVX_INS_COMMON(Inst##SS4, , rm_Int) + +#define CASE_FMA4_SCALAR_MR(Inst) \ + CASE_AVX_INS_COMMON(Inst##SD4, , mr) \ + CASE_AVX_INS_COMMON(Inst##SS4, , mr) \ + CASE_AVX_INS_COMMON(Inst##SD4, , mr_Int) \ + CASE_AVX_INS_COMMON(Inst##SS4, , mr_Int) + static unsigned getVectorRegSize(unsigned RegNo) { if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) return 512; @@ -247,14 +281,15 @@ static void printMasking(raw_ostream &OS, 
const MCInst *MI, OS << " {z}"; } -static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { +static bool printFMAComments(const MCInst *MI, raw_ostream &OS, + const MCInstrInfo &MCII) { const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr; unsigned NumOperands = MI->getNumOperands(); bool RegForm = false; bool Negate = false; StringRef AccStr = "+"; - // The operands for FMA instructions without rounding fall into two forms. + // The operands for FMA3 instructions without rounding fall into two forms: // dest, src1, src2, src3 // dest, src1, mask, src2, src3 // Where src3 is either a register or 5 memory address operands. So to find @@ -262,9 +297,112 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { // index from the end by taking into account memory vs register form when // finding src2. + // The operands for FMA4 instructions: + // dest, src1, src2, src3 + // Where src2 OR src3 are either a register or 5 memory address operands. So + // to find dest and src1 we can index from the front, src2 (reg/mem) follows + // and then src3 (reg) will be at the end. + switch (MI->getOpcode()) { default: return false; + + CASE_FMA4_PACKED_RR(FMADD) + CASE_FMA4_SCALAR_RR(FMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMADD) + CASE_FMA4_SCALAR_RM(FMADD) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + break; + CASE_FMA4_PACKED_MR(FMADD) + CASE_FMA4_SCALAR_MR(FMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + break; + + CASE_FMA4_PACKED_RR(FMSUB) + CASE_FMA4_SCALAR_RR(FMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMSUB) + CASE_FMA4_SCALAR_RM(FMSUB) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + CASE_FMA4_PACKED_MR(FMSUB) + CASE_FMA4_SCALAR_MR(FMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + break; + + CASE_FMA4_PACKED_RR(FNMADD) + CASE_FMA4_SCALAR_RR(FNMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FNMADD) + CASE_FMA4_SCALAR_RM(FNMADD) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + CASE_FMA4_PACKED_MR(FNMADD) + CASE_FMA4_SCALAR_MR(FNMADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + Negate = true; + break; + + CASE_FMA4_PACKED_RR(FNMSUB) + CASE_FMA4_SCALAR_RR(FNMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FNMSUB) + CASE_FMA4_SCALAR_RM(FNMSUB) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + CASE_FMA4_PACKED_MR(FNMSUB) + CASE_FMA4_SCALAR_MR(FNMSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-"; + Negate = true; + break; + + CASE_FMA4_PACKED_RR(FMADDSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMADDSUB) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name 
= getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + CASE_FMA4_PACKED_MR(FMADDSUB) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "+/-"; + break; + + CASE_FMA4_PACKED_RR(FMSUBADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_FMA4_PACKED_RM(FMSUBADD) + Mul2Name = getRegName(MI->getOperand(2).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + CASE_FMA4_PACKED_MR(FMSUBADD) + AccName = getRegName(MI->getOperand(NumOperands - 1).getReg()); + Mul1Name = getRegName(MI->getOperand(1).getReg()); + AccStr = "-/+"; + break; + CASE_FMA_PACKED_REG(FMADD132) CASE_FMA_SCALAR_REG(FMADD132) Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); @@ -476,8 +614,9 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) { if (!Mul2Name) Mul2Name = "mem"; if (!AccName) AccName = "mem"; - OS << DestName << " = "; - // TODO: Print masking information? + OS << DestName; + printMasking(OS, MI, MCII); + OS << " = "; if (Negate) OS << '-'; @@ -504,7 +643,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, unsigned NumOperands = MI->getNumOperands(); bool RegForm = false; - if (printFMA3Comments(MI, OS)) + if (printFMAComments(MI, OS, MCII)) return true; switch (MI->getOpcode()) { @@ -669,14 +808,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSLLDQri: case X86::VPSLLDQri: case X86::VPSLLDQYri: - case X86::VPSLLDQZ128rr: - case X86::VPSLLDQZ256rr: - case X86::VPSLLDQZrr: + case X86::VPSLLDQZ128ri: + case X86::VPSLLDQZ256ri: + case X86::VPSLLDQZri: Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; - case X86::VPSLLDQZ128rm: - case X86::VPSLLDQZ256rm: - case X86::VPSLLDQZrm: + case X86::VPSLLDQZ128mi: + case X86::VPSLLDQZ256mi: + case X86::VPSLLDQZmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0), @@ -687,14 +826,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PSRLDQri: case X86::VPSRLDQri: case X86::VPSRLDQYri: - case X86::VPSRLDQZ128rr: - case X86::VPSRLDQZ256rr: - case X86::VPSRLDQZrr: + case X86::VPSRLDQZ128ri: + case X86::VPSRLDQZ256ri: + case X86::VPSRLDQZri: Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; - case X86::VPSRLDQZ128rm: - case X86::VPSRLDQZ256rm: - case X86::VPSRLDQZrm: + case X86::VPSRLDQZ128mi: + case X86::VPSRLDQZ256mi: + case X86::VPSRLDQZmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0), @@ -1178,28 +1317,28 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeSubVectorBroadcast(16, 8, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rr) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rm) DecodeSubVectorBroadcast(4, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rr) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rr) 
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rm) DecodeSubVectorBroadcast(8, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rr) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rr) Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); LLVM_FALLTHROUGH; - CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m) - CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m) + CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rm) + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rm) DecodeSubVectorBroadcast(16, 2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index a21555076976..33d70fdb1214 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -13,6 +13,7 @@ #include "X86InstPrinterCommon.h" #include "X86BaseInfo.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" @@ -287,16 +288,23 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op, } } -/// printPCRelImm - This is used to print an immediate value that ends up -/// being encoded as a pc-relative value (e.g. for jumps and calls). In -/// Intel-style these print slightly differently than normal immediates. -/// for example, a $ is not emitted. -void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { +/// value (e.g. for jumps and calls). In Intel-style these print slightly +/// differently than normal immediates. For example, a $ is not emitted. +/// +/// \p Address The address of the next instruction. +/// \see MCInstPrinter::printInst +void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address, + unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) - O << formatImm(Op.getImm()); - else { + if (Op.isImm()) { + if (PrintBranchImmAsAddress) { + uint64_t Target = Address + Op.getImm(); + if (MAI.getCodePointerSize() == 4) + Target &= 0xffffffff; + O << formatHex(Target); + } else + O << formatImm(Op.getImm()); + } else { assert(Op.isExpr() && "unknown pcrel immediate operand"); // If a symbolic branch target was added as a constant expression then print // that address in hex. 
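With PrintBranchImmAsAddress, the new printPCRelImm above shows a pc-relative branch immediate as its absolute target: the encoded displacement is added to the address of the next instruction, and 32-bit code truncates the result. A minimal sketch with illustrative names:

#include <cstdint>
#include <cstdio>

void printBranchTarget(uint64_t NextInstAddr, int64_t Disp,
                       unsigned CodePointerSize) {
  uint64_t Target = NextInstAddr + static_cast<uint64_t>(Disp);
  if (CodePointerSize == 4)
    Target &= 0xffffffff;                 // 32-bit code: wrap to 32 bits
  std::printf("0x%llx\n", static_cast<unsigned long long>(Target));
}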
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 8e28f24b619a..bb12ede3b729 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -29,7 +29,9 @@ public: void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS); void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS); void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O); - void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printPCRelImm(const MCInst *MI, uint64_t Address, unsigned OpNo, + raw_ostream &O); + protected: void printInstFlags(const MCInst *MI, raw_ostream &O); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index f4bb0fbf62cd..d1eb4d09851d 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -45,8 +45,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (MI->getOpcode() == X86::DATA16_PREFIX && STI.getFeatureBits()[X86::Mode16Bit]) { OS << "\tdata32"; - } else if (!printAliasInstr(MI, OS) && - !printVecCompareInstr(MI, OS)) + } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS)) printInstruction(MI, Address, OS); // Next always print the annotation. diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h index b409b20cbea8..82baf611df03 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h @@ -31,9 +31,10 @@ public: // Autogenerated by tblgen, returns true if we successfully printed an // alias. - bool printAliasInstr(const MCInst *MI, raw_ostream &OS); - void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); + bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS); + void printCustomAliasOperand(const MCInst *MI, uint64_t Address, + unsigned OpIdx, unsigned PrintMethodIdx, + raw_ostream &O); // Autogenerated by tblgen. 
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O); @@ -47,14 +48,6 @@ public: void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O); void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS); - void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - - void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printMemReference(MI, OpNo, O); - } - void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << "byte ptr "; printMemReference(MI, OpNo, O); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index d986c829d98e..c294da6baffa 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -71,8 +71,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) { // (actually, must, since otherwise the non-extern relocations we produce // overwhelm ld64's tiny little mind and it fails). DwarfFDESymbolsUseAbsDiff = true; - - UseIntegratedAssembler = true; } X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) @@ -102,10 +100,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { // Exceptions handling ExceptionsType = ExceptionHandling::DwarfCFI; - - // Always enable the integrated assembler by default. - // Clang also enabled it when the OS is Solaris but that is redundant here. - UseIntegratedAssembler = true; } const MCExpr * @@ -141,8 +135,16 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) { TextAlignFillValue = 0x90; AllowAtInName = true; +} - UseIntegratedAssembler = true; +void X86MCAsmInfoMicrosoftMASM::anchor() { } + +X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM(const Triple &Triple) + : X86MCAsmInfoMicrosoft(Triple) { + DollarIsPC = true; + SeparatorString = "\n"; + CommentString = ";"; + AllowSymbolAtNameStart = true; } void X86MCAsmInfoGNUCOFF::anchor() { } @@ -164,6 +166,4 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) { TextAlignFillValue = 0x90; AllowAtInName = true; - - UseIntegratedAssembler = true; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index b2369647a40f..ce8e84fb96b9 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -13,7 +13,6 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCAsmInfoCOFF.h" #include "llvm/MC/MCAsmInfoDarwin.h" #include "llvm/MC/MCAsmInfoELF.h" @@ -49,6 +48,13 @@ public: explicit X86MCAsmInfoMicrosoft(const Triple &Triple); }; +class X86MCAsmInfoMicrosoftMASM : public X86MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoftMASM(const Triple &Triple); +}; + class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { void anchor() override; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 54a293702bd0..7dea0760a831 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -55,83 +55,64 @@ public: const MCSubtargetInfo &STI) const override; private: - unsigned getX86RegNum(const MCOperand &MO) const { - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; - } + unsigned getX86RegNum(const MCOperand &MO) const; - unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const { - return Ctx.getRegisterInfo()->getEncodingValue( - MI.getOperand(OpNum).getReg()); - } + unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const; /// \param MI a single low-level machine instruction. /// \param OpNum the operand #. /// \returns true if the OpNumth operand of MI require a bit to be set in /// REX prefix. - bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const { - return (getX86RegEncoding(MI, OpNum) >> 3) & 1; - } - - void emitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const { - OS << (char)C; - ++CurByte; - } - - void emitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, - raw_ostream &OS) const { - // Output the constant in little endian byte order. - for (unsigned i = 0; i != Size; ++i) { - emitByte(Val & 255, CurByte, OS); - Val >>= 8; - } - } + bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const; void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize, - MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, + MCFixupKind FixupKind, uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const; - static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { - assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); - return RM | (RegOpcode << 3) | (Mod << 6); - } - void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld, - unsigned &CurByte, raw_ostream &OS) const { - emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), CurByte, OS); - } + raw_ostream &OS) const; void emitSIBByte(unsigned SS, unsigned Index, unsigned Base, - unsigned &CurByte, raw_ostream &OS) const { - // SIB byte is in the same format as the modRMByte. 
- emitByte(modRMByte(SS, Index, Base), CurByte, OS); - } + raw_ostream &OS) const; void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField, - uint64_t TSFlags, bool Rex, unsigned &CurByte, + uint64_t TSFlags, bool HasREX, uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + const MCSubtargetInfo &STI, + bool ForceSIB = false) const; - void emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned &CurByte, - bool &Rex, const MCInst &MI, const MCInstrDesc &Desc, - const MCSubtargetInfo &STI, raw_ostream &OS) const; + bool emitPrefixImpl(unsigned &CurOp, const MCInst &MI, + const MCSubtargetInfo &STI, raw_ostream &OS) const; - void emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, - const MCInst &MI, const MCInstrDesc &Desc, + void emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const; - void emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand, - const MCInst &MI, raw_ostream &OS) const; + void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI, + raw_ostream &OS) const; - bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, - const MCInst &MI, const MCInstrDesc &Desc, + bool emitOpcodePrefix(int MemOperand, const MCInst &MI, const MCSubtargetInfo &STI, raw_ostream &OS) const; - uint8_t determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand, - const MCInstrDesc &Desc) const; + bool emitREXPrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const; }; } // end anonymous namespace +static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) { + assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!"); + return RM | (RegOpcode << 3) | (Mod << 6); +} + +static void emitByte(uint8_t C, raw_ostream &OS) { OS << static_cast<char>(C); } + +static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + emitByte(Val & 255, OS); + Val >>= 8; + } +} + /// \returns true if this signed displacement fits in a 8-bit sign-extended /// field. static bool isDisp8(int Value) { return Value == (int8_t)Value; } @@ -275,7 +256,8 @@ static bool hasSecRelSymbolRef(const MCExpr *Expr) { static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) { unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MCII.get(Opcode); - if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) || + if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4 && + Opcode != X86::JCC_4) || getImmFixupKind(Desc.TSFlags) != FK_PCRel_4) return false; @@ -288,9 +270,27 @@ static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) { return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None; } +unsigned X86MCCodeEmitter::getX86RegNum(const MCOperand &MO) const { + return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7; +} + +unsigned X86MCCodeEmitter::getX86RegEncoding(const MCInst &MI, + unsigned OpNum) const { + return Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(OpNum).getReg()); +} + +/// \param MI a single low-level machine instruction. +/// \param OpNum the operand #. +/// \returns true if the OpNumth operand of MI require a bit to be set in +/// REX prefix. 
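The hunk above turns modRMByte, emitByte and emitConstant into file-local statics and drops the CurByte counter in favour of raw_ostream positions. For reference, a minimal standalone sketch of how those two helpers pack a ModRM byte and emit a little-endian constant, assuming a plain std::string sink in place of llvm::raw_ostream (packModRM and appendLE are illustrative names, not part of the tree):

#include <cassert>
#include <cstdint>
#include <string>

// ModRM layout: [7:6] mod, [5:3] reg/opcode, [2:0] r/m.
static uint8_t packModRM(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM fields out of range");
  return static_cast<uint8_t>(RM | (RegOpcode << 3) | (Mod << 6));
}

// Constants are emitted low byte first (little endian), Size bytes in total.
static void appendLE(std::string &Out, uint64_t Val, unsigned Size) {
  for (unsigned I = 0; I != Size; ++I) {
    Out.push_back(static_cast<char>(Val & 0xFF));
    Val >>= 8;
  }
}

For example, packModRM(3, RegOpcodeFld, RegNum) is the register-direct form used by emitRegModRMByte; with the running counter gone, fixup offsets are instead recomputed later as OS.tell() - StartByte.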
+bool X86MCCodeEmitter::isREXExtendedReg(const MCInst &MI, + unsigned OpNum) const { + return (getX86RegEncoding(MI, OpNum) >> 3) & 1; +} + void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size, MCFixupKind FixupKind, - unsigned &CurByte, raw_ostream &OS, + uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const { const MCExpr *Expr = nullptr; @@ -299,7 +299,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, // relocation, emit it now. if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 && FixupKind != FK_PCRel_4) { - emitConstant(DispOp.getImm() + ImmOffset, Size, CurByte, OS); + emitConstant(DispOp.getImm() + ImmOffset, Size, OS); return; } Expr = MCConstantExpr::create(DispOp.getImm(), Ctx); @@ -322,7 +322,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, } if (Kind == GOT_Normal) - ImmOffset = CurByte; + ImmOffset = static_cast<int>(OS.tell() - StartByte); } else if (Expr->getKind() == MCExpr::SymbolRef) { if (hasSecRelSymbolRef(Expr)) { FixupKind = MCFixupKind(FK_SecRel_4); @@ -361,16 +361,30 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, Ctx); // Emit a symbolic constant as a fixup and 4 zeros. - Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc)); - emitConstant(0, Size, CurByte, OS); + Fixups.push_back(MCFixup::create(static_cast<uint32_t>(OS.tell() - StartByte), + Expr, FixupKind, Loc)); + emitConstant(0, Size, OS); +} + +void X86MCCodeEmitter::emitRegModRMByte(const MCOperand &ModRMReg, + unsigned RegOpcodeFld, + raw_ostream &OS) const { + emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), OS); +} + +void X86MCCodeEmitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base, + raw_ostream &OS) const { + // SIB byte is in the same format as the modRMByte. + emitByte(modRMByte(SS, Index, Base), OS); } void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField, - uint64_t TSFlags, bool Rex, - unsigned &CurByte, raw_ostream &OS, + uint64_t TSFlags, bool HasREX, + uint64_t StartByte, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { + const MCSubtargetInfo &STI, + bool ForceSIB) const { const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp); const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg); const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt); @@ -383,8 +397,9 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode assert(STI.hasFeature(X86::Mode64Bit) && "Rip-relative addressing requires 64-bit mode"); - assert(IndexReg.getReg() == 0 && "Invalid rip-relative address"); - emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS); + assert(IndexReg.getReg() == 0 && !ForceSIB && + "Invalid rip-relative address"); + emitByte(modRMByte(0, RegOpcodeField, 5), OS); unsigned Opcode = MI.getOpcode(); // movq loads are handled with a special relocation form which allows the @@ -395,7 +410,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, default: return X86::reloc_riprel_4byte; case X86::MOV64rm: - assert(Rex); + assert(HasREX); return X86::reloc_riprel_4byte_movq_load; case X86::CALL64m: case X86::JMP64m: @@ -409,8 +424,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, case X86::SBB64rm: case X86::SUB64rm: case X86::XOR64rm: - return Rex ? 
X86::reloc_riprel_4byte_relax_rex - : X86::reloc_riprel_4byte_relax; + return HasREX ? X86::reloc_riprel_4byte_relax_rex + : X86::reloc_riprel_4byte_relax; } }(); @@ -425,7 +440,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, ? X86II::getSizeOfImm(TSFlags) : 0; - emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS, Fixups, -ImmSize); return; } @@ -472,23 +487,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (Disp.isImm() && isDisp8(Disp.getImm())) { if (Disp.getImm() == 0 && RMfield != 6) { // There is no displacement; just the register. - emitByte(modRMByte(0, RegOpcodeField, RMfield), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, RMfield), OS); return; } // Use the [REG]+disp8 form, including for [BP] which cannot be encoded. - emitByte(modRMByte(1, RegOpcodeField, RMfield), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + emitByte(modRMByte(1, RegOpcodeField, RMfield), OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); return; } // This is the [REG]+disp16 case. - emitByte(modRMByte(2, RegOpcodeField, RMfield), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, RMfield), OS); } else { // There is no BaseReg; this is the plain [disp16] case. - emitByte(modRMByte(0, RegOpcodeField, 6), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 6), OS); } // Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases. - emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups); + emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups); return; } @@ -498,7 +513,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // 2-7) and absolute references. if ( // The SIB byte must be used if there is an index register. - IndexReg.getReg() == 0 && + !ForceSIB && IndexReg.getReg() == 0 && // The SIB byte must be used if the base is ESP/RSP/R12, all of which // encode to an R/M value of 4, which indicates that a SIB byte is // present. @@ -508,8 +523,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) { if (BaseReg == 0) { // [disp32] in X86-32 mode - emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups); + emitByte(modRMByte(0, RegOpcodeField, 5), OS); + emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, StartByte, OS, Fixups); return; } @@ -519,7 +534,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // by emitting a displacement of 0 below. if (BaseRegNo != N86::EBP) { if (Disp.isImm() && Disp.getImm() == 0) { - emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS); return; } @@ -530,7 +545,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // This is exclusively used by call *a@tlscall(base). The relocation // (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning. Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc())); - emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS); return; } } @@ -539,27 +554,27 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, if the displacement fits in a byte, encode as [REG+disp8]. 
if (Disp.isImm()) { if (!HasEVEX && isDisp8(Disp.getImm())) { - emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups); + emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); return; } // Try EVEX compressed 8-bit displacement first; if failed, fall back to // 32-bit displacement. int CDisp8 = 0; if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { - emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS); - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS); + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, CDisp8 - Disp.getImm()); return; } } // Otherwise, emit the most general non-SIB encoding: [REG+disp32] - emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS); unsigned Opcode = MI.getOpcode(); unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax : X86::reloc_signed_4byte; - emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS, + emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS, Fixups); return; } @@ -575,30 +590,30 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, if (BaseReg == 0) { // If there is no base register, we emit the special case SIB byte with // MOD=0, BASE=5, to JUST get the index, scale, and displacement. - emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 4), OS); ForceDisp32 = true; } else if (!Disp.isImm()) { // Emit the normal disp32 encoding. - emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, 4), OS); ForceDisp32 = true; } else if (Disp.getImm() == 0 && // Base reg can't be anything that ends up with '5' as the base // reg, it is the magic [*] nomenclature that indicates no base. BaseRegNo != N86::EBP) { // Emit no displacement ModR/M byte - emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(0, RegOpcodeField, 4), OS); } else if (!HasEVEX && isDisp8(Disp.getImm())) { // Emit the disp8 encoding. - emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(1, RegOpcodeField, 4), OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) { // Emit the disp8 encoding. - emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(1, RegOpcodeField, 4), OS); ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP ImmOffset = CDisp8 - Disp.getImm(); } else { // Emit the normal disp32 encoding. - emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS); + emitByte(modRMByte(2, RegOpcodeField, 4), OS); } // Calculate what the SS field value should be... 
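The branches above pick the ModRM mod field from the displacement: no displacement when it is zero (unless the base encodes as EBP/R13), an 8-bit displacement when the value fits in a signed byte (or an EVEX compressed disp8), and a 32-bit displacement otherwise. A hedged sketch of that decision, with DispForm and chooseDispForm as illustrative names only:

#include <cstdint>

enum class DispForm { None, Disp8, Disp32 };

static bool fitsInDisp8(int64_t V) { return V == static_cast<int8_t>(V); }

// Base registers that encode as 5 (EBP/RBP/R13) cannot use mod=00, because
// that bit pattern means "disp32 with no base", so they always carry a
// displacement even when it is zero.
static DispForm chooseDispForm(int64_t Disp, bool BaseEncodesAsEBP) {
  if (Disp == 0 && !BaseEncodesAsEBP)
    return DispForm::None;   // mod = 00
  if (fitsInDisp8(Disp))
    return DispForm::Disp8;  // mod = 01
  return DispForm::Disp32;   // mod = 10
}

The EVEX compressed-disp8 path above works the same way, except the stored byte is pre-scaled by the element size, which is why the call passes CDisp8 - Disp.getImm() as an immediate offset.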
@@ -613,77 +628,78 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, IndexRegNo = getX86RegNum(IndexReg); else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5) IndexRegNo = 4; - emitSIBByte(SS, IndexRegNo, 5, CurByte, OS); + emitSIBByte(SS, IndexRegNo, 5, OS); } else { unsigned IndexRegNo; if (IndexReg.getReg()) IndexRegNo = getX86RegNum(IndexReg); else IndexRegNo = 4; // For example [ESP+1*<noreg>+4] - emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), CurByte, OS); + emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS); } // Do we need to output a displacement? if (ForceDisp8) - emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, + emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups, ImmOffset); else if (ForceDisp32 || Disp.getImm() != 0) emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), - CurByte, OS, Fixups); + StartByte, OS, Fixups); } -void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, - unsigned &CurByte, bool &Rex, - const MCInst &MI, const MCInstrDesc &Desc, - const MCSubtargetInfo &STI, - raw_ostream &OS) const { +/// Emit all instruction prefixes. +/// +/// \returns true if REX prefix is used, otherwise returns false. +bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI, + const MCSubtargetInfo &STI, + raw_ostream &OS) const { + uint64_t TSFlags = MCII.get(MI.getOpcode()).TSFlags; // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags); - if (MemoryOperand != -1) - MemoryOperand += CurOp; - // Emit segment override opcode prefix as needed. - if (MemoryOperand >= 0) - emitSegmentOverridePrefix(CurByte, MemoryOperand + X86::AddrSegmentReg, MI, - OS); + if (MemoryOperand != -1) { + MemoryOperand += CurOp; + emitSegmentOverridePrefix(MemoryOperand + X86::AddrSegmentReg, MI, OS); + } // Emit the repeat opcode prefix as needed. unsigned Flags = MI.getFlags(); if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT) - emitByte(0xF3, CurByte, OS); + emitByte(0xF3, OS); if (Flags & X86::IP_HAS_REPEAT_NE) - emitByte(0xF2, CurByte, OS); + emitByte(0xF2, OS); // Emit the address size opcode prefix as needed. - bool need_address_override; + bool NeedAddressOverride; uint64_t AdSize = TSFlags & X86II::AdSizeMask; if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) || (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) || (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) { - need_address_override = true; + NeedAddressOverride = true; } else if (MemoryOperand < 0) { - need_address_override = false; + NeedAddressOverride = false; } else if (STI.hasFeature(X86::Mode64Bit)) { assert(!is16BitMemOperand(MI, MemoryOperand, STI)); - need_address_override = is32BitMemOperand(MI, MemoryOperand); + NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand); } else if (STI.hasFeature(X86::Mode32Bit)) { assert(!is64BitMemOperand(MI, MemoryOperand)); - need_address_override = is16BitMemOperand(MI, MemoryOperand, STI); + NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI); } else { assert(STI.hasFeature(X86::Mode16Bit)); assert(!is64BitMemOperand(MI, MemoryOperand)); - need_address_override = !is16BitMemOperand(MI, MemoryOperand, STI); + NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI); } - if (need_address_override) - emitByte(0x67, CurByte, OS); + if (NeedAddressOverride) + emitByte(0x67, OS); // Encoding type for this instruction. 
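The emitSIBByte calls above reuse the ModRM packing for the SIB byte. A small illustrative sketch of how the scale/index/base fields are assembled (scaleToSS and packSIB are hypothetical names; the in-tree code computes the SS field separately):

#include <cassert>
#include <cstdint>

// SIB[7:6] holds log2 of the scale, which x86 restricts to 1, 2, 4 or 8.
static unsigned scaleToSS(unsigned Scale) {
  switch (Scale) {
  case 1: return 0;
  case 2: return 1;
  case 4: return 2;
  case 8: return 3;
  default: assert(false && "invalid scale"); return 0;
  }
}

// Same layout as ModRM: [7:6] scale, [5:3] index, [2:0] base.
// An index field of 4 means "no index register"; a base field of 5 together
// with mod=00 means "no base, disp32 only".
static uint8_t packSIB(unsigned Scale, unsigned IndexRegNo, unsigned BaseRegNo) {
  return static_cast<uint8_t>(BaseRegNo | (IndexRegNo << 3) |
                              (scaleToSS(Scale) << 6));
}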
uint64_t Encoding = TSFlags & X86II::EncodingMask; - if (Encoding == 0) - Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS); + bool HasREX = false; + if (Encoding) + emitVEXOpcodePrefix(MemoryOperand, MI, OS); else - emitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS); + HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS); uint64_t Form = TSFlags & X86II::FormMask; switch (Form) { @@ -697,11 +713,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, "SI and DI register sizes do not match"); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(2).getReg() != X86::DS) - emitSegmentOverridePrefix(CurByte, 2, MI, OS); + emitSegmentOverridePrefix(2, MI, OS); // Emit AdSize prefix as needed. if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, CurByte, OS); + emitByte(0x67, OS); CurOp += 3; // Consume operands. break; } @@ -709,11 +725,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned siReg = MI.getOperand(0).getReg(); // Emit segment override opcode prefix as needed (not for %ds). if (MI.getOperand(1).getReg() != X86::DS) - emitSegmentOverridePrefix(CurByte, 1, MI, OS); + emitSegmentOverridePrefix(1, MI, OS); // Emit AdSize prefix as needed. if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) || (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI)) - emitByte(0x67, CurByte, OS); + emitByte(0x67, OS); CurOp += 2; // Consume operands. break; } @@ -722,24 +738,26 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, // Emit AdSize prefix as needed. if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) || (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI)) - emitByte(0x67, CurByte, OS); + emitByte(0x67, OS); ++CurOp; // Consume operand. break; } case X86II::RawFrmMemOffs: { // Emit segment override opcode prefix as needed. - emitSegmentOverridePrefix(CurByte, 1, MI, OS); + emitSegmentOverridePrefix(1, MI, OS); break; } } + + return HasREX; } -/// emitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix -/// called VEX. -void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, - int MemOperand, const MCInst &MI, - const MCInstrDesc &Desc, +/// AVX instructions are encoded using a opcode prefix called VEX. 
+void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const { + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX."); uint64_t Encoding = TSFlags & X86II::EncodingMask; @@ -868,8 +886,11 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, switch (TSFlags & X86II::FormMask) { default: llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!"); + case X86II::MRM_C0: case X86II::RawFrm: + case X86II::PrefixByte: break; + case X86II::MRMDestMemFSIB: case X86II::MRMDestMem: { // MRMDestMem instructions forms: // MemAddr, src1(ModR/M) @@ -900,6 +921,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EVEX_R2 = ~(RegEnc >> 4) & 1; break; } + case X86II::MRMSrcMemFSIB: case X86II::MRMSrcMem: { // MRMSrcMem instructions forms: // src1(ModR/M), MemAddr @@ -1081,6 +1103,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EncodeRC = true; break; } + case X86II::MRMr0: { + // MRMr0 instructions forms: + // 11:rrr:000 + // dst(ModR/M) + unsigned RegEnc = getX86RegEncoding(MI, CurOp++); + VEX_R = ~(RegEnc >> 3) & 1; + EVEX_R2 = ~(RegEnc >> 4) & 1; + break; + } case X86II::MRM0r: case X86II::MRM1r: case X86II::MRM2r: @@ -1127,15 +1158,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // Can we use the 2 byte VEX prefix? if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) { - emitByte(0xC5, CurByte, OS); - emitByte(LastByte | (VEX_R << 7), CurByte, OS); + emitByte(0xC5, OS); + emitByte(LastByte | (VEX_R << 7), OS); return; } // 3 byte VEX prefix - emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS); - emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - emitByte(LastByte | (VEX_W << 7), CurByte, OS); + emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, OS); + emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, OS); + emitByte(LastByte | (VEX_W << 7), OS); } else { assert(Encoding == X86II::EVEX && "unknown encoding!"); // EVEX opcode prefix can have 4 bytes @@ -1146,144 +1177,137 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, assert((VEX_5M & 0x3) == VEX_5M && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); - emitByte(0x62, CurByte, OS); + emitByte(0x62, OS); emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) | VEX_5M, - CurByte, OS); - emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, CurByte, OS); + emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, OS); if (EncodeRC) emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) | EVEX_aaa, - CurByte, OS); + OS); else emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) | EVEX_aaa, - CurByte, OS); + OS); } } -/// Determine if the MCInst has to be encoded with a X86-64 REX prefix which -/// specifies 1) 64-bit instructions, 2) non-default operand size, and 3) use -/// of X86-64 extended registers. 
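The two-byte versus three-byte VEX selection above reduces to: plain VEX encoding, no explicit VEX3 request, inverted B and X bits both set (so no extended base or index register), W clear, and the 0F opcode map (m-mmmm == 1). A hedged sketch of that predicate, keeping the inverted-bit convention of the surrounding code (canUseVex2 is an illustrative name):

// VEX_B/VEX_X follow the surrounding code's convention: they are stored
// inverted, so a value of 1 means "no extension needed".
static bool canUseVex2(bool RequestVex3, bool VEX_B, bool VEX_X, bool VEX_W,
                       unsigned Map5M) {
  return !RequestVex3 && VEX_B && VEX_X && !VEX_W && Map5M == 1;
}

// Two-byte prefix:   0xC5  [R' vvvv L pp]              (R' = inverted REX.R)
// Three-byte prefix: 0xC4  [R' X' B' m-mmmm]  [W vvvv L pp]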
-uint8_t X86MCCodeEmitter::determineREXPrefix(const MCInst &MI, uint64_t TSFlags, - int MemOperand, - const MCInstrDesc &Desc) const { - uint8_t REX = 0; - bool UsesHighByteReg = false; - - if (TSFlags & X86II::REX_W) - REX |= 1 << 3; // set REX.W +/// Emit REX prefix which specifies +/// 1) 64-bit instructions, +/// 2) non-default operand size, and +/// 3) use of X86-64 extended registers. +/// +/// \returns true if REX prefix is used, otherwise returns false. +bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI, + raw_ostream &OS) const { + uint8_t REX = [&, MemOperand]() { + uint8_t REX = 0; + bool UsesHighByteReg = false; + + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + + if (TSFlags & X86II::REX_W) + REX |= 1 << 3; // set REX.W + + if (MI.getNumOperands() == 0) + return REX; + + unsigned NumOps = MI.getNumOperands(); + unsigned CurOp = X86II::getOperandBias(Desc); + + // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. + for (unsigned i = CurOp; i != NumOps; ++i) { + const MCOperand &MO = MI.getOperand(i); + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; + if (X86II::isX86_64NonExtLowByteReg(Reg)) + // FIXME: The caller of determineREXPrefix slaps this prefix onto + // anything that returns non-zero. + REX |= 0x40; // REX fixed encoding prefix + } - if (MI.getNumOperands() == 0) + switch (TSFlags & X86II::FormMask) { + case X86II::AddRegFrm: + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + break; + case X86II::MRMSrcReg: + case X86II::MRMSrcRegCC: + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + break; + case X86II::MRMSrcMem: + case X86II::MRMSrcMemCC: + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X + CurOp += X86::AddrNumOperands; + break; + case X86II::MRMDestReg: + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + break; + case X86II::MRMDestMem: + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X + CurOp += X86::AddrNumOperands; + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + break; + case X86II::MRMXmCC: + case X86II::MRMXm: + case X86II::MRM0m: + case X86II::MRM1m: + case X86II::MRM2m: + case X86II::MRM3m: + case X86II::MRM4m: + case X86II::MRM5m: + case X86II::MRM6m: + case X86II::MRM7m: + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B + REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X + break; + case X86II::MRMXrCC: + case X86II::MRMXr: + case X86II::MRM0r: + case X86II::MRM1r: + case X86II::MRM2r: + case X86II::MRM3r: + case X86II::MRM4r: + case X86II::MRM5r: + case X86II::MRM6r: + case X86II::MRM7r: + REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B + break; + case X86II::MRMr0: + REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R + break; + case X86II::MRMDestMemFSIB: + llvm_unreachable("FSIB format never need REX prefix!"); + } + if (REX && UsesHighByteReg) + report_fatal_error( + "Cannot encode high byte register in REX-prefixed instruction"); return REX; + }(); - unsigned NumOps = MI.getNumOperands(); - unsigned CurOp = 
X86II::getOperandBias(Desc); - - // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix. - for (unsigned i = CurOp; i != NumOps; ++i) { - const MCOperand &MO = MI.getOperand(i); - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) - UsesHighByteReg = true; - if (X86II::isX86_64NonExtLowByteReg(Reg)) - // FIXME: The caller of determineREXPrefix slaps this prefix onto anything - // that returns non-zero. - REX |= 0x40; // REX fixed encoding prefix - } - - switch (TSFlags & X86II::FormMask) { - case X86II::AddRegFrm: - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - break; - case X86II::MRMSrcReg: - case X86II::MRMSrcRegCC: - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - break; - case X86II::MRMSrcMem: - case X86II::MRMSrcMemCC: - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X - CurOp += X86::AddrNumOperands; - break; - case X86II::MRMDestReg: - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - break; - case X86II::MRMDestMem: - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X - CurOp += X86::AddrNumOperands; - REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R - break; - case X86II::MRMXmCC: - case X86II::MRMXm: - case X86II::MRM0m: - case X86II::MRM1m: - case X86II::MRM2m: - case X86II::MRM3m: - case X86II::MRM4m: - case X86II::MRM5m: - case X86II::MRM6m: - case X86II::MRM7m: - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B - REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X - break; - case X86II::MRMXrCC: - case X86II::MRMXr: - case X86II::MRM0r: - case X86II::MRM1r: - case X86II::MRM2r: - case X86II::MRM3r: - case X86II::MRM4r: - case X86II::MRM5r: - case X86II::MRM6r: - case X86II::MRM7r: - REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B - break; - } - if (REX && UsesHighByteReg) - report_fatal_error( - "Cannot encode high byte register in REX-prefixed instruction"); + if (!REX) + return false; - return REX; + emitByte(0x40 | REX, OS); + return true; } /// Emit segment override opcode prefix as needed. -void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte, - unsigned SegOperand, +void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI, raw_ostream &OS) const { // Check for explicit segment override on memory operand. - switch (MI.getOperand(SegOperand).getReg()) { - default: - llvm_unreachable("Unknown segment register!"); - case 0: - break; - case X86::CS: - emitByte(0x2E, CurByte, OS); - break; - case X86::SS: - emitByte(0x36, CurByte, OS); - break; - case X86::DS: - emitByte(0x3E, CurByte, OS); - break; - case X86::ES: - emitByte(0x26, CurByte, OS); - break; - case X86::FS: - emitByte(0x64, CurByte, OS); - break; - case X86::GS: - emitByte(0x65, CurByte, OS); - break; - } + if (unsigned Reg = MI.getOperand(SegOperand).getReg()) + emitByte(X86::getSegmentOverridePrefixForReg(Reg), OS); } /// Emit all instruction prefixes prior to the opcode. 
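The emitSegmentOverridePrefix rewrite above collapses the per-register switch into X86::getSegmentOverridePrefixForReg. The mapping it replaces is visible in the removed lines; as a reference, a trivial standalone equivalent (Seg and segmentOverridePrefix are illustrative stand-ins, not LLVM API):

#include <cstdint>

enum class Seg { None, CS, SS, DS, ES, FS, GS }; // stand-ins for X86::CS etc.

// Legacy segment-override prefix byte, 0 when no override is needed
// (the same table the deleted switch above encoded).
static uint8_t segmentOverridePrefix(Seg S) {
  switch (S) {
  case Seg::CS: return 0x2E;
  case Seg::SS: return 0x36;
  case Seg::DS: return 0x3E;
  case Seg::ES: return 0x26;
  case Seg::FS: return 0x64;
  case Seg::GS: return 0x65;
  default:      return 0;
  }
}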
@@ -1291,48 +1315,44 @@ void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte, /// \param MemOperand the operand # of the start of a memory operand if present. /// If not present, it is -1. /// -/// \returns true if a REX prefix was used. -bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, - int MemOperand, const MCInst &MI, - const MCInstrDesc &Desc, +/// \returns true if REX prefix is used, otherwise returns false. +bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI, const MCSubtargetInfo &STI, raw_ostream &OS) const { - bool Ret = false; + const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); + uint64_t TSFlags = Desc.TSFlags; + // Emit the operand size opcode prefix as needed. if ((TSFlags & X86II::OpSizeMask) == (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16)) - emitByte(0x66, CurByte, OS); + emitByte(0x66, OS); // Emit the LOCK opcode prefix. if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK) - emitByte(0xF0, CurByte, OS); + emitByte(0xF0, OS); // Emit the NOTRACK opcode prefix. if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK) - emitByte(0x3E, CurByte, OS); + emitByte(0x3E, OS); switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: // 66 - emitByte(0x66, CurByte, OS); + emitByte(0x66, OS); break; case X86II::XS: // F3 - emitByte(0xF3, CurByte, OS); + emitByte(0xF3, OS); break; case X86II::XD: // F2 - emitByte(0xF2, CurByte, OS); + emitByte(0xF2, OS); break; } // Handle REX prefix. - // FIXME: Can this come before F2 etc to simplify emission? - if (STI.hasFeature(X86::Mode64Bit)) { - if (uint8_t REX = determineREXPrefix(MI, TSFlags, MemOperand, Desc)) { - emitByte(0x40 | REX, CurByte, OS); - Ret = true; - } - } else { - assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode."); - } + assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) && + "REX.W requires 64bit mode."); + bool HasREX = STI.hasFeature(X86::Mode64Bit) + ? emitREXPrefix(MemOperand, MI, OS) + : false; // 0x0F escape code must be emitted just before the opcode. switch (TSFlags & X86II::OpMapMask) { @@ -1340,19 +1360,20 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, case X86II::T8: // 0F 38 case X86II::TA: // 0F 3A case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller. - emitByte(0x0F, CurByte, OS); + emitByte(0x0F, OS); break; } switch (TSFlags & X86II::OpMapMask) { case X86II::T8: // 0F 38 - emitByte(0x38, CurByte, OS); + emitByte(0x38, OS); break; case X86II::TA: // 0F 3A - emitByte(0x3A, CurByte, OS); + emitByte(0x3A, OS); break; } - return Ret; + + return HasREX; } void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS, @@ -1362,16 +1383,12 @@ void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; // Pseudo instructions don't get encoded. - if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + if (X86II::isPseudo(TSFlags)) return; unsigned CurOp = X86II::getOperandBias(Desc); - // Keep track of the current byte being emitted. - unsigned CurByte = 0; - - bool Rex = false; - emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS); + emitPrefixImpl(CurOp, MI, STI, OS); } void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -1382,17 +1399,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; // Pseudo instructions don't get encoded. 
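Both emitREXPrefix above and its call site in this hunk rely on the REX layout 0100WRXB: W selects 64-bit operand size, while R, X and B extend the ModRM reg field, the SIB index, and the ModRM r/m or SIB base respectively. A minimal sketch of packing that byte under those assumptions (buildREX is an illustrative name; the in-tree code additionally forces a bare 0x40 when SPL/BPL/SIL/DIL is accessed, which this sketch does not model):

#include <cstdint>

// REX = 0100 W R X B; a zero payload means no prefix byte is emitted.
static uint8_t buildREX(bool W, bool R, bool X, bool B) {
  uint8_t REX = 0;
  REX |= static_cast<uint8_t>(W) << 3;
  REX |= static_cast<uint8_t>(R) << 2;
  REX |= static_cast<uint8_t>(X) << 1;
  REX |= static_cast<uint8_t>(B) << 0;
  return REX ? static_cast<uint8_t>(0x40 | REX) : 0;
}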
- if ((TSFlags & X86II::FormMask) == X86II::Pseudo) + if (X86II::isPseudo(TSFlags)) return; unsigned NumOps = Desc.getNumOperands(); unsigned CurOp = X86II::getOperandBias(Desc); - // Keep track of the current byte being emitted. - unsigned CurByte = 0; + uint64_t StartByte = OS.tell(); - bool Rex = false; - emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS); + bool HasREX = emitPrefixImpl(CurOp, MI, STI, OS); // It uses the VEX.VVVV field? bool HasVEX_4V = TSFlags & X86II::VEX_4V; @@ -1422,7 +1437,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::RawFrmDstSrc: case X86II::RawFrmSrc: case X86II::RawFrmDst: - emitByte(BaseOpcode, CurByte, OS); + case X86II::PrefixByte: + emitByte(BaseOpcode, OS); break; case X86II::AddCCFrm: { // This will be added to the opcode in the fallthrough. @@ -1431,47 +1447,47 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, --NumOps; // Drop the operand from the end. LLVM_FALLTHROUGH; case X86II::RawFrm: - emitByte(BaseOpcode + OpcodeOffset, CurByte, OS); + emitByte(BaseOpcode + OpcodeOffset, OS); if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII)) break; const MCOperand &Op = MI.getOperand(CurOp++); emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags), - MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS, + MCFixupKind(X86::reloc_branch_4byte_pcrel), StartByte, OS, Fixups); break; } case X86II::RawFrmMemOffs: - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); + StartByte, OS, Fixups); ++CurOp; // skip segment operand break; case X86II::RawFrmImm8: - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); - emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte, + StartByte, OS, Fixups); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups); break; case X86II::RawFrmImm16: - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); - emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte, + StartByte, OS, Fixups); + emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups); break; case X86II::AddRegFrm: - emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), CurByte, OS); + emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), OS); break; case X86II::MRMDestReg: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; if (HasEVEX_K) // Skip writemask @@ -1481,12 +1497,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++SrcRegNum; emitRegModRMByte(MI.getOperand(CurOp), - getX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS); + getX86RegNum(MI.getOperand(SrcRegNum)), OS); CurOp = SrcRegNum + 1; break; } + case X86II::MRMDestMemFSIB: case X86II::MRMDestMem: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + X86::AddrNumOperands; if (HasEVEX_K) // Skip writemask @@ -1495,13 +1512,14 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in 
VEX_VVVV) ++SrcRegNum; + bool ForceSIB = (Form == X86II::MRMDestMemFSIB); emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags, - Rex, CurByte, OS, Fixups, STI); + HasREX, StartByte, OS, Fixups, STI, ForceSIB); CurOp = SrcRegNum + 1; break; } case X86II::MRMSrcReg: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; if (HasEVEX_K) // Skip writemask @@ -1511,7 +1529,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++SrcRegNum; emitRegModRMByte(MI.getOperand(SrcRegNum), - getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + getX86RegNum(MI.getOperand(CurOp)), OS); CurOp = SrcRegNum + 1; if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); @@ -1521,17 +1539,17 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, break; } case X86II::MRMSrcReg4VOp3: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; emitRegModRMByte(MI.getOperand(SrcRegNum), - getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + getX86RegNum(MI.getOperand(CurOp)), OS); CurOp = SrcRegNum + 1; ++CurOp; // Encoded in VEX.VVVV break; } case X86II::MRMSrcRegOp4: { - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); unsigned SrcRegNum = CurOp + 1; // Skip 1st src (which is encoded in VEX_VVVV) @@ -1542,7 +1560,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, I8RegNum = getX86RegEncoding(MI, SrcRegNum++); emitRegModRMByte(MI.getOperand(SrcRegNum), - getX86RegNum(MI.getOperand(CurOp)), CurByte, OS); + getX86RegNum(MI.getOperand(CurOp)), OS); CurOp = SrcRegNum + 1; break; } @@ -1551,12 +1569,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned SecondOp = CurOp++; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, OS); emitRegModRMByte(MI.getOperand(SecondOp), - getX86RegNum(MI.getOperand(FirstOp)), CurByte, OS); + getX86RegNum(MI.getOperand(FirstOp)), OS); break; } + case X86II::MRMSrcMemFSIB: case X86II::MRMSrcMem: { unsigned FirstMemOp = CurOp + 1; @@ -1566,10 +1585,11 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); + bool ForceSIB = (Form == X86II::MRMSrcMemFSIB); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI, ForceSIB); CurOp = FirstMemOp + X86::AddrNumOperands; if (HasVEX_I8Reg) I8RegNum = getX86RegEncoding(MI, CurOp++); @@ -1578,10 +1598,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRMSrcMem4VOp3: { unsigned FirstMemOp = CurOp + 1; - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; ++CurOp; // Encoded in VEX.VVVV. 
break; @@ -1595,10 +1615,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg"); I8RegNum = getX86RegEncoding(MI, FirstMemOp++); - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI); CurOp = FirstMemOp + X86::AddrNumOperands; break; } @@ -1608,10 +1628,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, OS); emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)), - TSFlags, Rex, CurByte, OS, Fixups, STI); + TSFlags, HasREX, StartByte, OS, Fixups, STI); break; } @@ -1619,8 +1639,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned RegOp = CurOp++; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); - emitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS); + emitByte(BaseOpcode + CC, OS); + emitRegModRMByte(MI.getOperand(RegOp), 0, OS); break; } @@ -1637,10 +1657,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++CurOp; if (HasEVEX_K) // Skip writemask ++CurOp; - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitRegModRMByte(MI.getOperand(CurOp++), - (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, CurByte, - OS); + (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, OS); + break; + case X86II::MRMr0: + emitByte(BaseOpcode, OS); + emitByte(modRMByte(3, getX86RegNum(MI.getOperand(CurOp++)),0), OS); break; case X86II::MRMXmCC: { @@ -1648,9 +1671,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, CurOp = FirstMemOp + X86::AddrNumOperands; unsigned CC = MI.getOperand(CurOp++).getImm(); - emitByte(BaseOpcode + CC, CurByte, OS); + emitByte(BaseOpcode + CC, OS); - emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI); + emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, HasREX, StartByte, OS, Fixups, + STI); break; } @@ -1667,13 +1691,25 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, ++CurOp; if (HasEVEX_K) // Skip writemask ++CurOp; - emitByte(BaseOpcode, CurByte, OS); + emitByte(BaseOpcode, OS); emitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 
0 : Form - X86II::MRM0m, TSFlags, - Rex, CurByte, OS, Fixups, STI); + HasREX, StartByte, OS, Fixups, STI); CurOp += X86::AddrNumOperands; break; + case X86II::MRM0X: + case X86II::MRM1X: + case X86II::MRM2X: + case X86II::MRM3X: + case X86II::MRM4X: + case X86II::MRM5X: + case X86II::MRM6X: + case X86II::MRM7X: + emitByte(BaseOpcode, OS); + emitByte(0xC0 + ((Form - X86II::MRM0X) << 3), OS); + break; + case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: @@ -1738,8 +1774,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, case X86II::MRM_FD: case X86II::MRM_FE: case X86II::MRM_FF: - emitByte(BaseOpcode, CurByte, OS); - emitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS); + emitByte(BaseOpcode, OS); + emitByte(0xC0 + Form - X86II::MRM_C0, OS); break; } @@ -1754,7 +1790,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, I8RegNum |= Val; } emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1, - CurByte, OS, Fixups); + StartByte, OS, Fixups); } else { // If there is a remaining operand, it must be a trailing immediate. Emit it // according to the right size for the instruction. Some instructions @@ -1762,13 +1798,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, while (CurOp != NumOps && NumOps - CurOp <= 2) { emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags), - CurByte, OS, Fixups); + StartByte, OS, Fixups); } } if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow) - emitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS); + emitByte(X86II::getBaseOpcodeFor(TSFlags), OS); + assert(OS.tell() - StartByte <= 15 && + "The size of instruction must be no longer than 15."); #ifndef NDEBUG // FIXME: Verify. if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 049a3a815984..81110ba666e9 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -30,10 +30,6 @@ #include "llvm/Support/Host.h" #include "llvm/Support/TargetRegistry.h" -#if _MSC_VER -#include <intrin.h> -#endif - using namespace llvm; #define GET_REGINFO_MC_DESC @@ -294,7 +290,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (!FS.empty()) ArchFS = (Twine(ArchFS) + "," + FS).str(); - std::string CPUName = CPU; + std::string CPUName = std::string(CPU); if (CPUName.empty()) CPUName = "generic"; @@ -335,7 +331,10 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, MAI = new X86ELFMCAsmInfo(TheTriple); } else if (TheTriple.isWindowsMSVCEnvironment() || TheTriple.isWindowsCoreCLREnvironment()) { - MAI = new X86MCAsmInfoMicrosoft(TheTriple); + if (Options.getAssemblyLanguage().equals_lower("masm")) + MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple); + else + MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { MAI = new X86MCAsmInfoGNUCOFF(TheTriple); @@ -350,7 +349,7 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, // Initial state of the frame pointer is esp+stackGrowth. unsigned StackPtr = is64Bit ? 
X86::RSP : X86::ESP; - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa( + MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa( nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth); MAI->addInitialFrameState(Inst); @@ -401,6 +400,9 @@ public: findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents, uint64_t GotSectionVA, const Triple &TargetTriple) const override; + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override; Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst, uint64_t Addr, uint64_t Size) const override; @@ -519,6 +521,15 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries( } } +bool X86MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr, + uint64_t Size, uint64_t &Target) const { + if (Inst.getNumOperands() == 0 || + Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL) + return false; + Target = Addr + Size + Inst.getOperand(0).getImm(); + return true; +} + Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress( const MCInst &Inst, uint64_t Addr, uint64_t Size) const { const MCInstrDesc &MCID = Info->get(Inst.getOpcode()); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 0c789061f0e1..e8c72be1d9b6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -13,27 +13,28 @@ #ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H #define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H -#include "llvm/MC/MCRegister.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/DataTypes.h" +#include <memory> #include <string> namespace llvm { +class formatted_raw_ostream; class MCAsmBackend; class MCCodeEmitter; class MCContext; +class MCInst; +class MCInstPrinter; class MCInstrInfo; class MCObjectTargetWriter; class MCObjectWriter; +class MCRegister; class MCRegisterInfo; +class MCStreamer; class MCSubtargetInfo; -class MCRelocationInfo; class MCTargetOptions; +class MCTargetStreamer; class Target; class Triple; class StringRef; -class raw_ostream; -class raw_pwrite_stream; /// Flavour of dwarf regnumbers /// diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp index 48fd3e0b7ab9..62c1c399a606 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "X86ShuffleDecode.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" //===----------------------------------------------------------------------===// // Vector Mask Decoding @@ -141,9 +143,6 @@ void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, ShuffleMask.push_back(i + Imm); } -/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. 
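The evaluateBranch hook added above resolves a PC-relative branch: on x86 the displacement is relative to the end of the instruction, so the target is the instruction address plus its size plus the immediate. A hedged standalone sketch of the same computation (pcRelTarget is an illustrative name):

#include <cstdint>

// Target of a PC-relative branch: x86 encodes the displacement relative to
// the address of the next instruction (Addr + Size).
static uint64_t pcRelTarget(uint64_t Addr, uint64_t Size, int64_t Disp) {
  return Addr + Size + static_cast<uint64_t>(Disp);
}

// e.g. a 5-byte jmp at 0x1000 with imm32 = 0x20 targets 0x1025.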
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned Size = NumElts * ScalarBits; @@ -197,9 +196,6 @@ void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(h); } -/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates -/// the type of the vector allowing it to handle different datatypes and vector -/// widths. void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumLaneElts = 128 / ScalarBits; @@ -217,9 +213,6 @@ void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, } } -/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd -/// and punpckh*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl<int> &ShuffleMask) { // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -236,9 +229,6 @@ void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, } } -/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd -/// and punpckl*. VT indicates the type of the vector allowing it to handle -/// different datatypes and vector widths. void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl<int> &ShuffleMask) { // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -255,13 +245,11 @@ void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, } } -/// Decodes a broadcast of the first element of a vector. void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.append(NumElts, 0); } -/// Decodes a broadcast of a subvector to a larger vector type. void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts, SmallVectorImpl<int> &ShuffleMask) { unsigned Scale = DstNumElts / SrcNumElts; @@ -271,9 +259,6 @@ void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts, ShuffleMask.push_back(j); } -/// Decode a shuffle packed values at 128-bit granularity -/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) -/// immediate mask into a shuffle mask. void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { @@ -374,7 +359,6 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts, } } -/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { for (unsigned l = 0; l != NumElts; l += 4) @@ -384,32 +368,31 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm, void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, - SmallVectorImpl<int> &Mask) { + SmallVectorImpl<int> &ShuffleMask) { unsigned Scale = DstScalarBits / SrcScalarBits; assert(SrcScalarBits < DstScalarBits && "Expected zero extension mask to increase scalar size"); + int Sentinel = IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero; for (unsigned i = 0; i != NumDstElts; i++) { - Mask.push_back(i); - for (unsigned j = 1; j != Scale; j++) - Mask.push_back(IsAnyExtend ? 
SM_SentinelUndef : SM_SentinelZero); + ShuffleMask.push_back(i); + ShuffleMask.append(Scale - 1, Sentinel); } } void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) { ShuffleMask.push_back(0); - for (unsigned i = 1; i < NumElts; i++) - ShuffleMask.push_back(SM_SentinelZero); + ShuffleMask.append(NumElts - 1, SM_SentinelZero); } void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, - SmallVectorImpl<int> &Mask) { + SmallVectorImpl<int> &ShuffleMask) { // First element comes from the first element of second source. // Remaining elements: Load zero extends / Move copies from first source. - Mask.push_back(NumElts); + ShuffleMask.push_back(NumElts); for (unsigned i = 1; i < NumElts; i++) - Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); + ShuffleMask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); } void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h index f52785063071..4ef9959f7a27 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h @@ -14,15 +14,16 @@ #ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H #define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/SmallVector.h" +#include <cstdint> //===----------------------------------------------------------------------===// // Vector Mask Decoding //===----------------------------------------------------------------------===// namespace llvm { +class APInt; template <typename T> class ArrayRef; +template <typename T> class SmallVectorImpl; enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 }; @@ -61,20 +62,14 @@ void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// Decodes the shuffle masks for pshufhw. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// Decodes the shuffle masks for pshuflw. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); @@ -82,20 +77,14 @@ void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask); /// Decodes the shuffle masks for shufp*. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*. -/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl<int> &ShuffleMask); /// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*. 
-/// VT indicates the type of the vector allowing it to handle different -/// datatypes and vector widths. void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl<int> &ShuffleMask); @@ -119,6 +108,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// Decode a shuffle packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) /// immediate mask into a shuffle mask. void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index db624378d517..3bebcc24fd3a 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -28,7 +28,7 @@ public: void EmitWinEHHandlerData(SMLoc Loc) override; void EmitWindowsUnwindTables() override; void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; - void FinishImpl() override; + void finishImpl() override; }; void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { @@ -52,11 +52,11 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { XTS->emitFPOData(ProcSym, Loc); } -void X86WinCOFFStreamer::FinishImpl() { - EmitFrames(nullptr); +void X86WinCOFFStreamer::finishImpl() { + emitFrames(nullptr); EmitWindowsUnwindTables(); - MCWinCOFFStreamer::FinishImpl(); + MCWinCOFFStreamer::finishImpl(); } } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index d5494ef12370..11251fb2b2ba 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -159,7 +159,7 @@ bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) { MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() { MCSymbol *Label = getContext().createTempSymbol("cfi", true); - getStreamer().EmitLabel(Label); + getStreamer().emitLabel(Label); return Label; } @@ -372,13 +372,13 @@ void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) { OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize - OS.EmitIntValue(LocalSize, 4); - OS.EmitIntValue(FPO->ParamsSize, 4); - OS.EmitIntValue(MaxStackSize, 4); - OS.EmitIntValue(FrameFuncStrTabOff, 4); // FrameFunc + OS.emitInt32(LocalSize); + OS.emitInt32(FPO->ParamsSize); + OS.emitInt32(MaxStackSize); + OS.emitInt32(FrameFuncStrTabOff); // FrameFunc OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2); - OS.EmitIntValue(SavedRegSize, 2); - OS.EmitIntValue(CurFlags, 4); + OS.emitInt16(SavedRegSize); + OS.emitInt32(CurFlags); } /// Compute and emit the real CodeView FrameData subsection. @@ -398,12 +398,12 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) { MCSymbol *FrameBegin = Ctx.createTempSymbol(), *FrameEnd = Ctx.createTempSymbol(); - OS.EmitIntValue(unsigned(DebugSubsectionKind::FrameData), 4); + OS.emitInt32(unsigned(DebugSubsectionKind::FrameData)); OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4); - OS.EmitLabel(FrameBegin); + OS.emitLabel(FrameBegin); // Start with the RVA of the function in question. 
- OS.EmitValue(MCSymbolRefExpr::create(FPO->Function, + OS.emitValue(MCSymbolRefExpr::create(FPO->Function, MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx), 4); @@ -437,8 +437,8 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) { FSM.emitFrameDataRecord(OS, Inst.Label); } - OS.EmitValueToAlignment(4, 0); - OS.EmitLabel(FrameEnd); + OS.emitValueToAlignment(4, 0); + OS.emitLabel(FrameEnd); return false; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.h b/contrib/llvm-project/llvm/lib/Target/X86/X86.h index a0ab5c3a5b3c..91ba4e3d091e 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.h @@ -19,9 +19,7 @@ namespace llvm { class FunctionPass; -class ImmutablePass; class InstructionSelector; -class ModulePass; class PassRegistry; class X86RegisterBankInfo; class X86Subtarget; @@ -129,14 +127,23 @@ FunctionPass *createX86DiscriminateMemOpsPass(); /// This pass applies profiling information to insert cache prefetches. FunctionPass *createX86InsertPrefetchPass(); +/// This pass insert wait instruction after X87 instructions which could raise +/// fp exceptions when strict-fp enabled. +FunctionPass *createX86InsertX87waitPass(); + +/// This pass optimizes arithmetic based on knowledge that is only used by +/// a reduction sequence and is therefore safe to reassociate in interesting +/// ways. +FunctionPass *createX86PartialReductionPass(); + InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &, X86RegisterBankInfo &); FunctionPass *createX86LoadValueInjectionLoadHardeningPass(); -FunctionPass *createX86LoadValueInjectionLoadHardeningUnoptimizedPass(); FunctionPass *createX86LoadValueInjectionRetHardeningPass(); FunctionPass *createX86SpeculativeLoadHardeningPass(); +FunctionPass *createX86SpeculativeExecutionSideEffectSuppression(); void initializeEvexToVexInstPassPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); @@ -144,18 +151,21 @@ void initializeFixupLEAPassPass(PassRegistry &); void initializeFPSPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); +void initializeX86AvoidTrailingCallPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86CondBrFoldingPassPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86ExpandPseudoPass(PassRegistry &); +void initializeX86FixupSetCCPassPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); -void initializeX86LoadValueInjectionLoadHardeningUnoptimizedPassPass(PassRegistry &); void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &); void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); +void initializeX86PartialReductionPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); +void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); namespace X86AS { enum : unsigned { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.td b/contrib/llvm-project/llvm/lib/Target/X86/X86.td index bb8952f54e3a..dc1ff72add49 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.td @@ -52,13 +52,16 @@ def 
FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", "Support xsave instructions">; def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", - "Support xsaveopt instructions">; + "Support xsaveopt instructions", + [FeatureXSAVE]>; def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", - "Support xsavec instructions">; + "Support xsavec instructions", + [FeatureXSAVE]>; def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", - "Support xsaves instructions">; + "Support xsaves instructions", + [FeatureXSAVE]>; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions">; @@ -246,6 +249,14 @@ def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", // target-feature attribute. def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false", "Deprecated. Support MPX instructions">; +def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true", + "Support AMX-TILE instructions">; +def FeatureAMXINT8 : SubtargetFeature<"amx-int8", "HasAMXINT8", "true", + "Support AMX-INT8 instructions", + [FeatureAMXTILE]>; +def FeatureAMXBF16 : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true", + "Support AMX-BF16 instructions", + [FeatureAMXTILE]>; def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", @@ -273,6 +284,10 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", "Has ENQCMD instructions">; +def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true", + "Has serialize instruction">; +def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true", + "Support TSXLDTRK instructions">; // On some processors, instructions that implicitly take two memory operands are // slow. In practice, this means that CALL, PUSH, and POP with memory operands // should be avoided in favor of a MOV + register CALL/PUSH/POP. @@ -329,6 +344,11 @@ def FeatureFastLZCNT : SubtargetFeature< "fast-lzcnt", "HasFastLZCNT", "true", "LZCNT instructions are as fast as most simple integer ops">; +// If the target can efficiently decode NOPs upto 7-bytes in length. +def FeatureFast7ByteNOP + : SubtargetFeature< + "fast-7bytenop", "HasFast7ByteNOP", "true", + "Target can quickly decode up to 7 byte NOPs">; // If the target can efficiently decode NOPs upto 11-bytes in length. def FeatureFast11ByteNOP : SubtargetFeature< @@ -435,6 +455,15 @@ def FeatureLVIControlFlowIntegrity "LFENCE instruction to serialize control flow. Also decompose RET " "instructions into a POP+LFENCE+JMP sequence.">; +// Enable SESES to mitigate speculative execution attacks +def FeatureSpeculativeExecutionSideEffectSuppression + : SubtargetFeature< + "seses", "UseSpeculativeExecutionSideEffectSuppression", "true", + "Prevent speculative execution side channel timing attacks by " + "inserting a speculation barrier before memory reads, memory writes, " + "and conditional branches. 
Implies LVI Control Flow integrity.", + [FeatureLVIControlFlowIntegrity]>; + // Mitigate LVI attacks against data loads def FeatureLVILoadHardening : SubtargetFeature< @@ -562,7 +591,8 @@ def ProcessorFeatures { FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, FeatureFastSHLDRotate, - FeatureMergeToThreeWayBranch]; + FeatureMergeToThreeWayBranch, + FeatureFast15ByteNOP]; list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32, FeaturePOPCNTFalseDeps]; list<SubtargetFeature> SNBInheritableFeatures = @@ -744,6 +774,7 @@ def ProcessorFeatures { list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM, FeatureSlowDivide64, FeatureSlowPMULLD, + FeatureFast7ByteNOP, FeaturePOPCNTFalseDeps]; list<SubtargetFeature> SLMInheritableFeatures = !listconcat(AtomInheritableFeatures, SLMAdditionalFeatures); @@ -778,15 +809,13 @@ def ProcessorFeatures { !listconcat(GLPInheritableFeatures, GLPSpecificFeatures); // Tremont - list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE, - FeatureGFNI, - FeatureMOVDIRI, - FeatureMOVDIR64B, - FeatureWAITPKG]; + list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB, + FeatureGFNI]; list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts]; + list<SubtargetFeature> TRMInheritableFeatures = + !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures); list<SubtargetFeature> TRMFeatures = - !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures, - TRMSpecificFeatures); + !listconcat(TRMInheritableFeatures, TRMSpecificFeatures); // Knights Landing list<SubtargetFeature> KNLFeatures = [FeatureX87, @@ -838,6 +867,7 @@ def ProcessorFeatures { FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, + FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD, @@ -933,6 +963,8 @@ def ProcessorFeatures { // Excavator list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2, FeatureBMI2, + FeatureMOVBE, + FeatureRDRAND, FeatureMWAITX]; list<SubtargetFeature> BdVer4InheritableFeatures = !listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures); @@ -993,7 +1025,7 @@ def ProcessorFeatures { class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled +// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled // if i386/i486 is specifically requested. 
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>; @@ -1256,6 +1288,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [ FeatureNOPL, Feature64Bit, FeatureSlow3OpsLEA, + FeatureSlowDivide64, FeatureSlowIncDec, FeatureMacroFusion, FeatureInsertVZEROUPPER diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp index 1ac291fcb887..aa03217d155d 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -18,6 +18,7 @@ #include "TargetInfo/X86TargetInfo.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" +#include "X86Subtarget.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -40,6 +41,8 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + using namespace llvm; X86AsmPrinter::X86AsmPrinter(TargetMachine &TM, @@ -76,7 +79,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { } // Emit the rest of the function body. - EmitFunctionBody(); + emitFunctionBody(); // Emit the XRay table for this function. emitXRayTable(); @@ -87,7 +90,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } -void X86AsmPrinter::EmitFunctionBodyStart() { +void X86AsmPrinter::emitFunctionBodyStart() { if (EmitFPOData) { if (auto *XTS = static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer())) @@ -97,7 +100,7 @@ void X86AsmPrinter::EmitFunctionBodyStart() { } } -void X86AsmPrinter::EmitFunctionBodyEnd() { +void X86AsmPrinter::emitFunctionBodyEnd() { if (EmitFPOData) { if (auto *XTS = static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer())) @@ -124,7 +127,7 @@ void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO, MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); else - GVSym = getSymbol(GV); + GVSym = getSymbolPreferLocal(*GV); // Handle dllimport linkage. if (MO.getTargetFlags() == X86II::MO_DLLIMPORT) @@ -619,7 +622,7 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, return false; } -void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { +void X86AsmPrinter::emitStartOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatELF()) { @@ -641,17 +644,17 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { // Emitting note header. int WordSize = TT.isArch64Bit() ? 8 : 4; - EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); - OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0" - OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size - OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/); - OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name + emitAlignment(WordSize == 4 ? Align(4) : Align(8)); + OutStreamer->emitIntValue(4, 4 /*size*/); // data size for "GNU\0" + OutStreamer->emitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size + OutStreamer->emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/); + OutStreamer->emitBytes(StringRef("GNU", 4)); // note name // Emitting an Elf_Prop for the CET properties. 
- OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4); - OutStreamer->EmitIntValue(4, 4); // data size - OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data - EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding + OutStreamer->emitInt32(ELF::GNU_PROPERTY_X86_FEATURE_1_AND); + OutStreamer->emitInt32(4); // data size + OutStreamer->emitInt32(FeatureFlagsAnd); // data + emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding OutStreamer->endSection(Nt); OutStreamer->SwitchSection(Cur); @@ -683,30 +686,30 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { if (M.getModuleFlag("cfguard")) Feat00Flags |= 0x800; // Object is CFG-aware. - OutStreamer->EmitSymbolAttribute(S, MCSA_Global); - OutStreamer->EmitAssignment( + OutStreamer->emitSymbolAttribute(S, MCSA_Global); + OutStreamer->emitAssignment( S, MCConstantExpr::create(Feat00Flags, MMI->getContext())); } - OutStreamer->EmitSyntaxDirective(); + OutStreamer->emitSyntaxDirective(); // If this is not inline asm and we're in 16-bit // mode prefix assembly with .code16. bool is16 = TT.getEnvironment() == Triple::CODE16; if (M.getModuleInlineAsm().empty() && is16) - OutStreamer->EmitAssemblerFlag(MCAF_Code16); + OutStreamer->emitAssemblerFlag(MCAF_Code16); } static void emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, MachineModuleInfoImpl::StubValueTy &MCSym) { // L_foo$stub: - OutStreamer.EmitLabel(StubLabel); + OutStreamer.emitLabel(StubLabel); // .indirect_symbol _foo - OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); + OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol); if (MCSym.getInt()) // External to current translation unit. - OutStreamer.EmitIntValue(0, 4/*size*/); + OutStreamer.emitIntValue(0, 4/*size*/); else // Internal to current translation unit. // @@ -714,7 +717,7 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel, // pointers need to be indirect and pc-rel. We accomplish this by // using NLPs; however, sometimes the types are local to the file. // We need to fill in the value for the NLP in those cases. - OutStreamer.EmitValue( + OutStreamer.emitValue( MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()), 4 /*size*/); } @@ -742,7 +745,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { } } -void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { +void X86AsmPrinter::emitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) { @@ -759,7 +762,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { // points). If this doesn't occur, the linker can safely perform dead code // stripping. Since LLVM never generates code that does this, it is always // safe to set. - OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols); } else if (TT.isOSBinFormatCOFF()) { if (MMI->usesMSVCFloatingPoint()) { // In Windows' libcmt.lib, there is a file which is linked in only if the @@ -778,7 +781,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) { StringRef SymbolName = (TT.getArch() == Triple::x86) ? 
"__fltused" : "_fltused"; MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName); - OutStreamer->EmitSymbolAttribute(S, MCSA_Global); + OutStreamer->emitSymbolAttribute(S, MCSA_Global); return; } emitStackMaps(SM); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h index ee79401dc80d..eb485fa2ecef 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h @@ -9,12 +9,9 @@ #ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H #define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H -#include "X86Subtarget.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/StackMaps.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/Target/TargetMachine.h" // Implemented in X86MCInstLower.cpp namespace { @@ -22,8 +19,10 @@ namespace { } namespace llvm { +class MCCodeEmitter; class MCStreamer; -class MCSymbol; +class X86Subtarget; +class TargetMachine; class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { const X86Subtarget *Subtarget = nullptr; @@ -123,14 +122,14 @@ public: const X86Subtarget &getSubtarget() const { return *Subtarget; } - void EmitStartOfAsmFile(Module &M) override; + void emitStartOfAsmFile(Module &M) override; - void EmitEndOfAsmFile(Module &M) override; + void emitEndOfAsmFile(Module &M) override; - void EmitInstruction(const MachineInstr *MI) override; + void emitInstruction(const MachineInstr *MI) override; - void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { - AsmPrinter::EmitBasicBlockEnd(MBB); + void emitBasicBlockEnd(const MachineBasicBlock &MBB) override { + AsmPrinter::emitBasicBlockEnd(MBB); SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); } @@ -147,8 +146,8 @@ public: } bool runOnMachineFunction(MachineFunction &F) override; - void EmitFunctionBodyStart() override; - void EmitFunctionBodyEnd() override; + void emitFunctionBodyStart() override; + void emitFunctionBodyEnd() override; }; } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index 0f1d4b51062e..9f1fece1b9dd 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -1,4 +1,4 @@ -//===- X86AvoidStoreForwardingBlockis.cpp - Avoid HW Store Forward Block --===// +//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -33,6 +33,7 @@ // transformation done here is correct regardless to other memory accesses. 
//===----------------------------------------------------------------------===// +#include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -287,7 +288,7 @@ static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) { return 0; } -static int getAddrOffset(MachineInstr *MI) { +static int getAddrOffset(const MachineInstr *MI) { const MCInstrDesc &Descl = MI->getDesc(); int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags); assert(AddrOffset != -1 && "Expected Memory Operand"); @@ -310,11 +311,11 @@ static MachineOperand &getDispOperand(MachineInstr *MI) { // TODO: Consider expanding to other addressing modes in the future static bool isRelevantAddressingMode(MachineInstr *MI) { int AddrOffset = getAddrOffset(MI); - MachineOperand &Base = getBaseOperand(MI); - MachineOperand &Disp = getDispOperand(MI); - MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt); - MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg); - MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg); + const MachineOperand &Base = getBaseOperand(MI); + const MachineOperand &Disp = getDispOperand(MI); + const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt); + const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg); + const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg); if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI())) return false; @@ -410,9 +411,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, // If the load and store are consecutive, use the loadInst location to // reduce register pressure. MachineInstr *StInst = StoreInst; - auto PrevInstrIt = skipDebugInstructionsBackward( - std::prev(MachineBasicBlock::instr_iterator(StoreInst)), - MBB->instr_begin()); + auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst), + MBB->instr_begin()); if (PrevInstrIt.getNodePtr() == LoadInst) StInst = LoadInst; MachineInstr *NewStore = @@ -498,9 +498,10 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst, static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) { MachineOperand &LoadBase = getBaseOperand(LoadInst); MachineOperand &StoreBase = getBaseOperand(StoreInst); - auto StorePrevNonDbgInstr = skipDebugInstructionsBackward( - std::prev(MachineBasicBlock::instr_iterator(StoreInst)), - LoadInst->getParent()->instr_begin()).getNodePtr(); + auto *StorePrevNonDbgInstr = + prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst), + LoadInst->getParent()->instr_begin()) + .getNodePtr(); if (LoadBase.isReg()) { MachineInstr *LastLoad = LoadInst->getPrevNode(); // If the original load and store to xmm/ymm were consecutive @@ -550,11 +551,8 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { if (StoreMI.getParent() == MI.getParent() && isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) && isRelevantAddressingMode(&MI) && - isRelevantAddressingMode(&StoreMI)) { - assert(MI.hasOneMemOperand() && - "Expected one memory operand for load instruction"); - assert(StoreMI.hasOneMemOperand() && - "Expected one memory operand for store instruction"); + isRelevantAddressingMode(&StoreMI) && + MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) { if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin())) BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI)); } @@ -563,7 +561,7 @@ void 
X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { } unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { - auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI, + const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI, *LoadInst->getParent()->getParent()); return TRI->getRegSizeInBits(*TRC) / 8; } @@ -616,8 +614,8 @@ void X86AvoidSFBPass::breakBlockedCopies( static bool hasSameBaseOpValue(MachineInstr *LoadInst, MachineInstr *StoreInst) { - MachineOperand &LoadBase = getBaseOperand(LoadInst); - MachineOperand &StoreBase = getBaseOperand(StoreInst); + const MachineOperand &LoadBase = getBaseOperand(LoadInst); + const MachineOperand &StoreBase = getBaseOperand(StoreInst); if (LoadBase.isReg() != StoreBase.isReg()) return false; if (LoadBase.isReg()) @@ -691,13 +689,12 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) { SmallVector<MachineInstr *, 2> PotentialBlockers = findPotentialBlockers(LoadInst); - for (auto PBInst : PotentialBlockers) { + for (auto *PBInst : PotentialBlockers) { if (!isPotentialBlockingStoreInst(PBInst->getOpcode(), LoadInst->getOpcode()) || - !isRelevantAddressingMode(PBInst)) + !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand()) continue; int64_t PBstDispImm = getDispOperand(PBInst).getImm(); - assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand"); unsigned PBstSize = (*PBInst->memoperands_begin())->getSize(); // This check doesn't cover all cases, but it will suffice for now. // TODO: take branch probability into consideration, if the blocking @@ -727,7 +724,7 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) { ForRemoval.push_back(LoadInst); ForRemoval.push_back(StoreInst); } - for (auto RemovedInst : ForRemoval) { + for (auto *RemovedInst : ForRemoval) { RemovedInst->eraseFromParent(); } ForRemoval.clear(); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index fb4f9e2901dc..0899783d5f60 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -6,10 +6,29 @@ // //===----------------------------------------------------------------------===// // -// The Windows x64 unwinder has trouble unwinding the stack when a return -// address points to the end of the function. This pass maintains the invariant -// that every return address is inside the bounds of its parent function or -// funclet by inserting int3 if the last instruction would otherwise be a call. +// The Windows x64 unwinder decodes the instruction stream during unwinding. +// The unwinder decodes forward from the current PC to detect epilogue code +// patterns. +// +// First, this means that there must be an instruction after every +// call instruction for the unwinder to decode. LLVM must maintain the invariant +// that the last instruction of a function or funclet is not a call, or the +// unwinder may decode into the next function. Similarly, a call may not +// immediately precede an epilogue code pattern. As of this writing, the +// SEH_Epilogue pseudo instruction takes care of that. +// +// Second, all non-tail call jump targets must be within the *half-open* +// interval of the bounds of the function. 
The unwinder distinguishes between +// internal jump instructions and tail calls in an epilogue sequence by checking +// the jump target against the function bounds from the .pdata section. This +// means that the last regular MBB of an LLVM function must not be empty if +// there are regular jumps targeting it. +// +// This pass upholds these invariants by ensuring that blocks at the end of a +// function or funclet are a) not empty and b) do not end in a CALL instruction. +// +// Unwinder implementation for reference: +// https://github.com/dotnet/coreclr/blob/a9f3fc16483eecfc47fb79c362811d870be02249/src/unwinder/amd64/unwinder_amd64.cpp#L1015 // //===----------------------------------------------------------------------===// @@ -18,33 +37,35 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#define DEBUG_TYPE "x86-avoid-trailing-call" +#define AVOIDCALL_DESC "X86 avoid trailing call pass" +#define AVOIDCALL_NAME "x86-avoid-trailing-call" + +#define DEBUG_TYPE AVOIDCALL_NAME using namespace llvm; namespace { - class X86AvoidTrailingCallPass : public MachineFunctionPass { public: X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; -private: - StringRef getPassName() const override { - return "X86 avoid trailing call pass"; - } static char ID; + +private: + StringRef getPassName() const override { return AVOIDCALL_DESC; } }; +} // end anonymous namespace char X86AvoidTrailingCallPass::ID = 0; -} // end anonymous namespace - FunctionPass *llvm::createX86AvoidTrailingCallPass() { return new X86AvoidTrailingCallPass(); } +INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false) + // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. This logic conservatively assumes // they might expand to nothing. @@ -62,6 +83,11 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { const X86InstrInfo &TII = *STI.getInstrInfo(); assert(STI.isTargetWin64() && "pass only runs on Win64"); + // We don't need to worry about any of the invariants described above if there + // is no unwind info (CFI). + if (!MF.hasWinCFI()) + return false; + // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops // before epilogues. @@ -73,33 +99,34 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { if (NextMBB && !NextMBB->isEHFuncletEntry()) continue; - // Find the last real instruction in this block, or previous blocks if this - // block is empty. - MachineBasicBlock::reverse_iterator LastRealInstr; - for (MachineBasicBlock &RMBB : - make_range(MBB.getReverseIterator(), MF.rend())) { - LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction); - if (LastRealInstr != RMBB.rend()) - break; - } - - // Do nothing if this function or funclet has no instructions. - if (LastRealInstr == MF.begin()->rend()) - continue; + // Find the last real instruction in this block. + auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction); - // If this is a call instruction, insert int3 right after it with the same - // DebugLoc. Convert back to a forward iterator and advance the insertion - // position once. - if (isCallInstruction(*LastRealInstr)) { + // If the block is empty or the last real instruction is a call instruction, + // insert an int3. If there is a call instruction, insert the int3 between + // the call and any labels or other meta instructions. 
If the block is + // empty, insert at block end. + bool IsEmpty = LastRealInstr == MBB.rend(); + bool IsCall = !IsEmpty && isCallInstruction(*LastRealInstr); + if (IsEmpty || IsCall) { LLVM_DEBUG({ - dbgs() << "inserting int3 after trailing call instruction:\n"; - LastRealInstr->dump(); - dbgs() << '\n'; + if (IsCall) { + dbgs() << "inserting int3 after trailing call instruction:\n"; + LastRealInstr->dump(); + dbgs() << '\n'; + } else { + dbgs() << "inserting int3 in trailing empty MBB:\n"; + MBB.dump(); + } }); - MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse()); - BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(), - TII.get(X86::INT3)); + MachineBasicBlock::iterator MBBI = MBB.end(); + DebugLoc DL; + if (IsCall) { + MBBI = std::next(LastRealInstr.getReverse()); + DL = LastRealInstr->getDebugLoc(); + } + BuildMI(MBB, MBBI, DL, TII.get(X86::INT3)); Changed = true; } } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index f8faa572dffc..caa1f7952475 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -17,6 +17,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/X86BaseInfo.h" +#include "X86.h" #include "X86FrameLowering.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" @@ -162,14 +163,13 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // memory for arguments. unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); - bool UseStackProbe = - !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF); for (MachineBasicBlock &BB : MF) { bool InsideFrameSequence = false; for (MachineInstr &MI : BB) { if (MI.getOpcode() == FrameSetupOpcode) { - if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe) + if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall) return false; if (InsideFrameSequence) return false; @@ -199,7 +199,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, if (CannotReserveFrame) return true; - unsigned StackAlign = TFL->getStackAlignment(); + Align StackAlign = TFL->getStackAlign(); int64_t Advantage = 0; for (auto CC : CallSeqVector) { @@ -222,7 +222,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need a sub before - if (CC.ExpectedDist % StackAlign) + if (!isAligned(StackAlign, CC.ExpectedDist)) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. @@ -531,6 +531,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8; } Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp); + Push->cloneMemRefs(MF, *Store); break; case X86::MOV32mr: case X86::MOV64mr: { @@ -550,7 +551,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // If PUSHrmm is not slow on this target, try to fold the source of the // push into the instruction. 
- bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); + bool SlowPUSHrmm = STI->slowTwoMemOps(); // Check that this is legal to fold. Right now, we're extremely // conservative about that. @@ -562,6 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) Push->addOperand(DefMov->getOperand(i)); + Push->cloneMergedMemRefs(MF, {&*DefMov, &*Store}); DefMov->eraseFromParent(); } else { @@ -569,6 +571,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) .addReg(Reg) .getInstr(); + Push->cloneMemRefs(MF, *Store); } break; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp index 57bf799cf89c..319dc9470604 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp @@ -108,17 +108,15 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { MachinePointerInfo &MPO) override { LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); LLT SType = LLT::scalar(DL.getPointerSizeInBits(0)); - Register SPReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister()); + auto SPReg = + MIRBuilder.buildCopy(p0, STI.getRegisterInfo()->getStackRegister()); - Register OffsetReg = MRI.createGenericVirtualRegister(SType); - MIRBuilder.buildConstant(OffsetReg, Offset); + auto OffsetReg = MIRBuilder.buildConstant(SType, Offset); - Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); + auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); - return AddrReg; + return AddrReg.getReg(0); } void assignValueToReg(Register ValVReg, Register PhysReg, @@ -139,7 +137,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { if (PhysRegSize > ValSize && LocSize == ValSize) { assert((PhysRegSize == 128 || PhysRegSize == 80) && "We expect that to be 128 bit"); auto MIB = MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg); - ExtReg = MIB->getOperand(0).getReg(); + ExtReg = MIB.getReg(0); } else ExtReg = extendRegister(ValVReg, VA); @@ -148,10 +146,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { + MachineFunction &MF = MIRBuilder.getMF(); Register ExtReg = extendRegister(ValVReg, VA); - auto MMO = MIRBuilder.getMF().getMachineMemOperand( - MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(), - /* Alignment */ 1); + + auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, + VA.getLocVT().getStoreSize(), + inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildStore(ExtReg, Addr, *MMO); } @@ -240,17 +240,17 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); - Register AddrReg = MRI.createGenericVirtualRegister( - LLT::pointer(0, DL.getPointerSizeInBits(0))); - MIRBuilder.buildFrameIndex(AddrReg, FI); - return AddrReg; + return MIRBuilder + .buildFrameIndex(LLT::pointer(0, DL.getPointerSizeInBits(0)), FI) + .getReg(0); } void 
assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MachineFunction &MF = MIRBuilder.getMF(); + auto MMO = MF.getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, - 1); + inferAlignFromPtrInfo(MF, MPO)); MIRBuilder.buildLoad(ValVReg, Addr, *MMO); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h index 444a0c7d0122..b5ea7782896b 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h @@ -14,12 +14,12 @@ #ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING_H #define LLVM_LIB_TARGET_X86_X86CALLLOWERING_H -#include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include <functional> namespace llvm { +template <typename T> class ArrayRef; class DataLayout; class MachineRegisterInfo; class X86TargetLowering; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp index aee344a26764..c899db60e016 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp @@ -60,7 +60,7 @@ static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); } - // Successful in allocating regsiters - stop scanning next rules. + // Successful in allocating registers - stop scanning next rules. return true; } @@ -166,7 +166,7 @@ static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State.getMachineFunction().getSubtarget().getRegisterInfo(); if (TRI->regsOverlap(Reg, X86::XMM4) || TRI->regsOverlap(Reg, X86::XMM5)) - State.AllocateStack(8, 8); + State.AllocateStack(8, Align(8)); if (!ArgFlags.isHva()) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); @@ -281,7 +281,7 @@ static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (UseRegs) It.convertToReg(State.AllocateReg(RegList[FirstFree++])); else - It.convertToMem(State.AllocateStack(4, 4)); + It.convertToMem(State.AllocateStack(4, Align(4))); State.addLoc(It); } @@ -305,7 +305,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (ArgCount == 1 && ValNo == 0) { // If we have one argument, the argument is five stack slots big, at fixed // offset zero. - Offset = State.AllocateStack(5 * SlotSize, 4); + Offset = State.AllocateStack(5 * SlotSize, Align(4)); } else if (ArgCount == 2 && ValNo == 0) { // If we have two arguments, the stack slot is *after* the error code // argument. Pretend it doesn't consume stack space, and account for it when @@ -316,7 +316,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT, // appears first on the stack, and is then followed by the five slot // interrupt struct. 
Offset = 0; - (void)State.AllocateStack(6 * SlotSize, 4); + (void)State.AllocateStack(6 * SlotSize, Align(4)); } else { report_fatal_error("unsupported x86 interrupt prototype"); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td index db1aef2fd09d..802e694999b6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td @@ -789,8 +789,9 @@ def CC_X86_32_Vector_Darwin : CallingConv<[ /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP /// values are spilled on the stack. def CC_X86_32_Common : CallingConv<[ - // Handles byval parameters. + // Handles byval/preallocated parameters. CCIfByVal<CCPassByVal<4, 4>>, + CCIfPreallocated<CCPassByVal<4, 4>>, // The first 3 float or double arguments, if marked 'inreg' and if the call // is not a vararg call and if SSE2 is available, are passed in SSE registers. @@ -1145,7 +1146,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, (sequence "YMM%u", 8, 15))>; -def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15, (sequence "ZMM%u", 16, 31), K4, K5, K6, K7)>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp index 7051550d52e6..2ff8ee19561b 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -29,7 +29,7 @@ using namespace llvm; static cl::opt<bool> EnableDiscriminateMemops( DEBUG_TYPE, cl::init(false), cl::desc("Generate unique debug info for each instruction with a memory " - "operand. Should be enabled for profile-drived cache prefetching, " + "operand. Should be enabled for profile-driven cache prefetching, " "both in the build of the binary being profiled, as well as in " "the build of the binary consuming the profile."), cl::Hidden); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp index 438b9fd8eebb..488ee51f1d89 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -283,7 +283,7 @@ public: // A converter is identified by <destination domain, source opcode> typedef std::pair<int, unsigned> InstrConverterBaseKeyTy; -typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *> +typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>> InstrConverterBaseMap; /// A closure is a set of virtual register representing all of the edges in @@ -471,8 +471,8 @@ void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) { // instruction. 
for (int i = 0; i != NumDomains; ++i) { if (C.isLegal((RegDomain)i)) { - InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()}); - if (!IC || !IC->isLegal(MI, TII)) + auto I = Converters.find({i, MI->getOpcode()}); + if (I == Converters.end() || !I->second->isLegal(MI, TII)) C.setIllegal((RegDomain)i); } } @@ -484,8 +484,8 @@ double X86DomainReassignment::calculateCost(const Closure &C, double Cost = 0.0; for (auto *MI : C.instructions()) - Cost += - Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI); + Cost += Converters.find({DstDomain, MI->getOpcode()}) + ->second->getExtraCost(MI, MRI); return Cost; } @@ -501,8 +501,8 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const { // appropriate converter. SmallVector<MachineInstr *, 8> ToErase; for (auto *MI : C.instructions()) - if (Converters.lookup({Domain, MI->getOpcode()}) - ->convertInstr(MI, TII, MRI)) + if (Converters.find({Domain, MI->getOpcode()}) + ->second->convertInstr(MI, TII, MRI)) ToErase.push_back(MI); // Iterate all registers in the closure, replace them with registers in the @@ -606,19 +606,21 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) { void X86DomainReassignment::initConverters() { Converters[{MaskDomain, TargetOpcode::PHI}] = - new InstrIgnore(TargetOpcode::PHI); + std::make_unique<InstrIgnore>(TargetOpcode::PHI); Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] = - new InstrIgnore(TargetOpcode::IMPLICIT_DEF); + std::make_unique<InstrIgnore>(TargetOpcode::IMPLICIT_DEF); Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] = - new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2); + std::make_unique<InstrReplaceWithCopy>(TargetOpcode::INSERT_SUBREG, 2); Converters[{MaskDomain, TargetOpcode::COPY}] = - new InstrCOPYReplacer(TargetOpcode::COPY, MaskDomain, TargetOpcode::COPY); + std::make_unique<InstrCOPYReplacer>(TargetOpcode::COPY, MaskDomain, + TargetOpcode::COPY); auto createReplacerDstCOPY = [&](unsigned From, unsigned To) { - Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(From, To); + Converters[{MaskDomain, From}] = + std::make_unique<InstrReplacerDstCOPY>(From, To); }; createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm); @@ -638,7 +640,7 @@ void X86DomainReassignment::initConverters() { } auto createReplacer = [&](unsigned From, unsigned To) { - Converters[{MaskDomain, From}] = new InstrReplacer(From, To); + Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To); }; createReplacer(X86::MOV16rm, X86::KMOVWkm); @@ -779,8 +781,6 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { } } - DeleteContainerSeconds(Converters); - LLVM_DEBUG( dbgs() << "***** Machine Function after Domain Reassignment *****\n"); LLVM_DEBUG(MF.print(dbgs())); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp index f1cf9b94c9e5..540ad98b6d54 100755 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp @@ -237,11 +237,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { // Make sure the tables are sorted. 
static std::atomic<bool> TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { - assert(std::is_sorted(std::begin(X86EvexToVex128CompressTable), - std::end(X86EvexToVex128CompressTable)) && + assert(llvm::is_sorted(X86EvexToVex128CompressTable) && "X86EvexToVex128CompressTable is not sorted!"); - assert(std::is_sorted(std::begin(X86EvexToVex256CompressTable), - std::end(X86EvexToVex256CompressTable)) && + assert(llvm::is_sorted(X86EvexToVex256CompressTable) && "X86EvexToVex256CompressTable is not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp index d35d65914b34..c47ef4708e91 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -275,7 +275,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); - MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); + + // Update the call site info. + if (MBBI->isCandidateForCallSiteEntry()) + MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); @@ -331,14 +334,6 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBB.erase(MBBI); return true; } - case X86::EH_RESTORE: { - // Restore ESP and EBP, and optionally ESI if required. - bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( - MBB.getParent()->getFunction().getPersonalityFn())); - X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); - MBBI->eraseFromParent(); - return true; - } case X86::LCMPXCHG8B_SAVE_EBX: case X86::LCMPXCHG16B_SAVE_RBX: { // Perform the following transformation. @@ -371,6 +366,82 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBBI->eraseFromParent(); return true; } + // Loading/storing mask pairs requires two kmov operations. The second one of + // these needs a 2 byte displacement relative to the specified address (with + // 32 bit spill size). The pairs of 1bit masks up to 16 bit masks all use the + // same spill size, they all are stored using MASKPAIR16STORE, loaded using + // MASKPAIR16LOAD. + // + // The displacement value might wrap around in theory, thus the asserts in + // both cases. + case X86::MASKPAIR16LOAD: { + int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); + assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); + Register Reg = MBBI->getOperand(0).getReg(); + bool DstIsDead = MBBI->getOperand(0).isDead(); + Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0); + Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1); + + auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm)) + .addReg(Reg0, RegState::Define | getDeadRegState(DstIsDead)); + auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm)) + .addReg(Reg1, RegState::Define | getDeadRegState(DstIsDead)); + + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MIBLo.add(MBBI->getOperand(1 + i)); + if (i == X86::AddrDisp) + MIBHi.addImm(Disp + 2); + else + MIBHi.add(MBBI->getOperand(1 + i)); + } + + // Split the memory operand, adjusting the offset and size for the halves. 
+ MachineMemOperand *OldMMO = MBBI->memoperands().front(); + MachineFunction *MF = MBB.getParent(); + MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2); + MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2); + + MIBLo.setMemRefs(MMOLo); + MIBHi.setMemRefs(MMOHi); + + // Delete the pseudo. + MBB.erase(MBBI); + return true; + } + case X86::MASKPAIR16STORE: { + int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); + assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); + Register Reg = MBBI->getOperand(X86::AddrNumOperands).getReg(); + bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); + Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0); + Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1); + + auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk)); + auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk)); + + for (int i = 0; i < X86::AddrNumOperands; ++i) { + MIBLo.add(MBBI->getOperand(i)); + if (i == X86::AddrDisp) + MIBHi.addImm(Disp + 2); + else + MIBHi.add(MBBI->getOperand(i)); + } + MIBLo.addReg(Reg0, getKillRegState(SrcIsKill)); + MIBHi.addReg(Reg1, getKillRegState(SrcIsKill)); + + // Split the memory operand, adjusting the offset and size for the halves. + MachineMemOperand *OldMMO = MBBI->memoperands().front(); + MachineFunction *MF = MBB.getParent(); + MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2); + MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2); + + MIBLo.setMemRefs(MMOLo); + MIBHi.setMemRefs(MMOHi); + + // Delete the pseudo. + MBB.erase(MBBI); + return true; + } case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp index a1d256ea872d..b305940139c0 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp @@ -26,7 +26,6 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" @@ -498,7 +497,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, default: return false; case MVT::i1: { // Mask out all but lowest bit. - unsigned AndResult = createResultReg(&X86::GR8RegClass); + Register AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::AND8ri), AndResult) .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); @@ -691,7 +690,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, } } - unsigned ValReg = getRegForValue(Val); + Register ValReg = getRegForValue(Val); if (ValReg == 0) return false; @@ -761,9 +760,9 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Ok, we need to do a load from a stub. If we've already loaded from // this stub, reuse the loaded pointer, otherwise emit the load now. - DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V); - unsigned LoadReg; - if (I != LocalValueMap.end() && I->second != 0) { + DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V); + Register LoadReg; + if (I != LocalValueMap.end() && I->second) { LoadReg = I->second; } else { // Issue load from stub. 
@@ -1128,10 +1127,8 @@ bool X86FastISel::X86SelectStore(const Instruction *I) { if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; - unsigned Alignment = S->getAlignment(); - unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); - if (Alignment == 0) // Ensure that codegen never sees alignment 0 - Alignment = ABIAlignment; + Align Alignment = S->getAlign(); + Align ABIAlignment = DL.getABITypeAlign(Val->getType()); bool Aligned = Alignment >= ABIAlignment; X86AddressMode AM; @@ -1196,7 +1193,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); - unsigned Reg = getRegForValue(RV); + Register Reg = getRegForValue(RV); if (Reg == 0) return false; @@ -1264,7 +1261,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. if (F.hasStructRetAttr() && CC != CallingConv::Swift) { - unsigned Reg = X86MFInfo->getSRetReturnReg(); + Register Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; @@ -1322,14 +1319,9 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { if (!X86SelectAddress(Ptr, AM)) return false; - unsigned Alignment = LI->getAlignment(); - unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType()); - if (Alignment == 0) // Ensure that codegen never sees alignment 0 - Alignment = ABIAlignment; - unsigned ResultReg = 0; if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg, - Alignment)) + LI->getAlign().value())) return false; updateValueMap(I, ResultReg); @@ -1392,7 +1384,7 @@ static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT, const DebugLoc &CurDbgLoc) { - unsigned Op0Reg = getRegForValue(Op0); + Register Op0Reg = getRegForValue(Op0); if (Op0Reg == 0) return false; // Handle 'null' like i32/i64 0. 
@@ -1414,7 +1406,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT, unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); if (CompareOpc == 0) return false; - unsigned Op1Reg = getRegForValue(Op1); + Register Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc)) .addReg(Op0Reg) @@ -1487,8 +1479,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc())) return false; - unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); - unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + Register FlagReg1 = createResultReg(&X86::GR8RegClass); + Register FlagReg2 = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), @@ -1522,7 +1514,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { if (!TLI.isTypeLegal(DstVT)) return false; - unsigned ResultReg = getRegForValue(I->getOperand(0)); + Register ResultReg = getRegForValue(I->getOperand(0)); if (ResultReg == 0) return false; @@ -1548,7 +1540,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { default: llvm_unreachable("Unexpected zext to i64 source type"); } - unsigned Result32 = createResultReg(&X86::GR32RegClass); + Register Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) .addReg(ResultReg); @@ -1559,7 +1551,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { } else if (DstVT == MVT::i16) { // i8->i16 doesn't exist in the autogenerated isel table. Need to zero // extend to 32-bits and then extract down to 16-bits. - unsigned Result32 = createResultReg(&X86::GR32RegClass); + Register Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8), Result32).addReg(ResultReg); @@ -1581,7 +1573,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) { if (!TLI.isTypeLegal(DstVT)) return false; - unsigned ResultReg = getRegForValue(I->getOperand(0)); + Register ResultReg = getRegForValue(I->getOperand(0)); if (ResultReg == 0) return false; @@ -1589,7 +1581,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) { MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT == MVT::i1) { // Set the high bits to zero. - unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg, + Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); if (ZExtReg == 0) return false; @@ -1605,7 +1597,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) { if (DstVT == MVT::i16) { // i8->i16 doesn't exist in the autogenerated isel table. Need to sign // extend to 32-bits and then extract down to 16-bits. 
- unsigned Result32 = createResultReg(&X86::GR32RegClass); + Register Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8), Result32).addReg(ResultReg); @@ -1720,7 +1712,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { case MVT::i64: TestOpc = X86::TEST64ri32; break; } if (TestOpc) { - unsigned OpReg = getRegForValue(TI->getOperand(0)); + Register OpReg = getRegForValue(TI->getOperand(0)); if (OpReg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc)) @@ -1742,7 +1734,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. - unsigned TmpReg = getRegForValue(BI->getCondition()); + Register TmpReg = getRegForValue(BI->getCondition()); if (TmpReg == 0) return false; @@ -1755,7 +1747,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { // Otherwise do a clumsy setcc and re-test it. // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used // in an explicit cast, so make sure to handle that correctly. - unsigned OpReg = getRegForValue(BI->getCondition()); + Register OpReg = getRegForValue(BI->getCondition()); if (OpReg == 0) return false; // In case OpReg is a K register, COPY to a GPR @@ -1824,10 +1816,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { if (!isTypeLegal(I->getType(), VT)) return false; - unsigned Op0Reg = getRegForValue(I->getOperand(0)); + Register Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); + Register Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CReg).addReg(Op1Reg); @@ -1839,7 +1831,7 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { TII.get(TargetOpcode::KILL), X86::CL) .addReg(CReg, RegState::Kill); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) .addReg(Op0Reg); updateValueMap(I, ResultReg); @@ -1933,10 +1925,10 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { const DivRemEntry &TypeEntry = OpTable[TypeIndex]; const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex]; - unsigned Op0Reg = getRegForValue(I->getOperand(0)); + Register Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; - unsigned Op1Reg = getRegForValue(I->getOperand(1)); + Register Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; @@ -1949,7 +1941,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpEntry.OpSignExtend)); else { - unsigned Zero32 = createResultReg(&X86::GR32RegClass); + Register Zero32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), Zero32); @@ -1986,8 +1978,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { if ((I->getOpcode() == Instruction::SRem || I->getOpcode() == Instruction::URem) && OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) { - unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass); - unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass); + Register SourceSuperReg = createResultReg(&X86::GR16RegClass); + 
Register ResultSuperReg = createResultReg(&X86::GR16RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), SourceSuperReg).addReg(X86::AX); @@ -2066,15 +2058,15 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { return false; if (SETFOpc) { - unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); - unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); + Register FlagReg1 = createResultReg(&X86::GR8RegClass); + Register FlagReg2 = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), FlagReg1).addImm(SETFOpc[0]); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), FlagReg2).addImm(SETFOpc[1]); auto const &II = TII.get(SETFOpc[2]); if (II.getNumDefs()) { - unsigned TmpReg = createResultReg(&X86::GR8RegClass); + Register TmpReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) .addReg(FlagReg2).addReg(FlagReg1); } else { @@ -2086,7 +2078,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { } else if (foldX86XALUIntrinsic(CC, I, Cond)) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. - unsigned TmpReg = getRegForValue(Cond); + Register TmpReg = getRegForValue(Cond); if (TmpReg == 0) return false; @@ -2099,7 +2091,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // accurate. If we read more than the lsb, we may see non-zero values // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for // the select. This is achieved by performing TEST against 1. - unsigned CondReg = getRegForValue(Cond); + Register CondReg = getRegForValue(Cond); if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); @@ -2122,10 +2114,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); if (!LHSReg || !RHSReg) @@ -2133,7 +2125,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo(); unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8); - unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, + Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); updateValueMap(I, ResultReg); return true; @@ -2182,19 +2174,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); - unsigned CmpLHSReg = getRegForValue(CmpLHS); + Register CmpLHSReg = getRegForValue(CmpLHS); bool CmpLHSIsKill = hasTrivialKill(CmpLHS); - unsigned CmpRHSReg = getRegForValue(CmpRHS); + Register CmpRHSReg = getRegForValue(CmpRHS); bool CmpRHSIsKill = hasTrivialKill(CmpRHS); - if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) + if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg) return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); @@ 
-2207,12 +2199,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned CmpOpcode = (RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr; - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill, + Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); // Need an IMPLICIT_DEF for the input that is used to generate the upper // bits of the result register since its not based on any of the inputs. - unsigned ImplicitDefReg = createResultReg(VR128X); + Register ImplicitDefReg = createResultReg(VR128X); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); @@ -2241,9 +2233,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned BlendOpcode = (RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr; - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CmpReg, true); ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2263,13 +2255,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { } const TargetRegisterClass *VR128 = &X86::VR128RegClass; - unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, + Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false, + Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); - unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true, + Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); - unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true, + Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true, AndReg, /*IsKill=*/true); ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -2317,7 +2309,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc())) return false; } else { - unsigned CondReg = getRegForValue(Cond); + Register CondReg = getRegForValue(Cond); if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); @@ -2340,10 +2332,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); - unsigned RHSReg = getRegForValue(RHS); + Register RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); if (!LHSReg || !RHSReg) @@ -2351,7 +2343,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned ResultReg = + Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); updateValueMap(I, ResultReg); return true; @@ -2373,12 +2365,12 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) { } // No need for a select anymore - this is an unconditional move. 
if (Opnd) { - unsigned OpReg = getRegForValue(Opnd); + Register OpReg = getRegForValue(Opnd); if (OpReg == 0) return false; bool OpIsKill = hasTrivialKill(Opnd); const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(OpReg, getKillRegState(OpIsKill)); @@ -2419,7 +2411,7 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) { return false; // Select integer to float/double conversion. - unsigned OpReg = getRegForValue(I->getOperand(0)); + Register OpReg = getRegForValue(I->getOperand(0)); if (OpReg == 0) return false; @@ -2448,10 +2440,10 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) { MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT(); const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT); - unsigned ImplicitDefReg = createResultReg(RC); + Register ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); - unsigned ResultReg = + Register ResultReg = fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false); updateValueMap(I, ResultReg); return true; @@ -2474,7 +2466,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, "Instruction must be an FPExt or FPTrunc!"); bool HasAVX = Subtarget->hasAVX(); - unsigned OpReg = getRegForValue(I->getOperand(0)); + Register OpReg = getRegForValue(I->getOperand(0)); if (OpReg == 0) return false; @@ -2486,7 +2478,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, } - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); MachineInstrBuilder MIB; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc), ResultReg); @@ -2537,7 +2529,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { if (!TLI.isTypeLegal(SrcVT)) return false; - unsigned InputReg = getRegForValue(I->getOperand(0)); + Register InputReg = getRegForValue(I->getOperand(0)); if (!InputReg) // Unhandled operand. Halt "fast" selection and bail. return false; @@ -2549,7 +2541,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { } // Issue an extract_subreg. - unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, + Register ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg, false, X86::sub_8bit); if (!ResultReg) @@ -2608,7 +2600,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { return false; const Value *Op = II->getArgOperand(0); - unsigned InputReg = getRegForValue(Op); + Register InputReg = getRegForValue(Op); if (InputReg == 0) return false; @@ -2632,12 +2624,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // used to provide rounding control: use MXCSR.RC, encoded as 0b100. // It's consistent with the other FP instructions, which are usually // controlled by MXCSR. - InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4); + unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr + : X86::VCVTPS2PHrr; + InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4); // Move the lower 32-bits of ResultReg to another register of class GR32. + Opc = Subtarget->hasAVX512() ? 
X86::VMOVPDI2DIZrr + : X86::VMOVPDI2DIrr; ResultReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::VMOVPDI2DIrr), ResultReg) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(InputReg, RegState::Kill); // The result value is in the lower 16-bits of ResultReg. @@ -2645,19 +2640,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx); } else { assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!"); - // Explicitly sign-extend the input to 32-bit. - InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg, + // Explicitly zero-extend the input to 32-bit. + InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg, /*Kill=*/false); // The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr. InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR, InputReg, /*Kill=*/true); - InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true); + unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr + : X86::VCVTPH2PSrr; + InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Kill=*/true); // The result value is in the lower 32-bits of ResultReg. // Emit an explicit copy from register class VR128 to register class FR32. - ResultReg = createResultReg(&X86::FR32RegClass); + ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(InputReg, RegState::Kill); @@ -2700,7 +2697,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // Always make a copy of the frame register to a vreg first, so that we // never directly reference the frame register (the TwoAddressInstruction- // Pass doesn't like that). - unsigned SrcReg = createResultReg(RC); + Register SrcReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); @@ -2830,7 +2827,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { } const Value *SrcVal = II->getArgOperand(0); - unsigned SrcReg = getRegForValue(SrcVal); + Register SrcReg = getRegForValue(SrcVal); if (SrcReg == 0) return false; @@ -2843,7 +2840,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); } - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); MachineInstrBuilder MIB; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); @@ -2903,7 +2900,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break; } - unsigned LHSReg = getRegForValue(LHS); + Register LHSReg = getRegForValue(LHS); if (LHSReg == 0) return false; bool LHSIsKill = hasTrivialKill(LHS); @@ -2974,7 +2971,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { return false; // Assign to a GPR since the overflow return value is lowered to a SETcc. 
- unsigned ResultReg2 = createResultReg(&X86::GR8RegClass); + Register ResultReg2 = createResultReg(&X86::GR8RegClass); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr), ResultReg2).addImm(CondCode); @@ -3041,11 +3038,11 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { Op = IE->getOperand(0); } - unsigned Reg = getRegForValue(Op); + Register Reg = getRegForValue(Op); if (Reg == 0) return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Reg); @@ -3139,11 +3136,11 @@ bool X86FastISel::fastLowerArguments() { case MVT::f32: LLVM_FALLTHROUGH; case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; } - unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); + Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); @@ -3154,7 +3151,7 @@ bool X86FastISel::fastLowerArguments() { static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, CallingConv::ID CC, - ImmutableCallSite *CS) { + const CallBase *CB) { if (Subtarget->is64Bit()) return 0; if (Subtarget->getTargetTriple().isOSMSVCRT()) @@ -3163,9 +3160,9 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget, CC == CallingConv::HiPE || CC == CallingConv::Tail) return 0; - if (CS) - if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) || - CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU()) + if (CB) + if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) || + CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU()) return 0; return 4; @@ -3186,14 +3183,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CC); - const CallInst *CI = - CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr; + const CallInst *CI = dyn_cast_or_null<CallInst>(CLI.CB); const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr; // Call / invoke instructions with NoCfCheck attribute require special // handling. - const auto *II = - CLI.CS ? dyn_cast<InvokeInst>(CLI.CS->getInstruction()) : nullptr; + const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB); if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck())) return false; @@ -3239,11 +3234,11 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; // Don't know about inalloca yet. 
- if (CLI.CS && CLI.CS->hasInAllocaArgument()) + if (CLI.CB && CLI.CB->hasInAllocaArgument()) return false; for (auto Flag : CLI.OutFlags) - if (Flag.isSwiftError()) + if (Flag.isSwiftError() || Flag.isPreallocated()) return false; SmallVector<MVT, 16> OutVTs; @@ -3269,9 +3264,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { MVT VT; auto *TI = dyn_cast<TruncInst>(Val); unsigned ResultReg; - if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && - (TI->getParent() == CLI.CS->getInstruction()->getParent()) && - TI->hasOneUse()) { + if (TI && TI->getType()->isIntegerTy(1) && CLI.CB && + (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) { Value *PrevVal = TI->getOperand(0); ResultReg = getRegForValue(PrevVal); @@ -3284,7 +3278,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); } else { - if (!isTypeLegal(Val->getType(), VT)) + if (!isTypeLegal(Val->getType(), VT) || + (VT.isVector() && VT.getVectorElementType() == MVT::i1)) return false; ResultReg = getRegForValue(Val); } @@ -3302,7 +3297,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Allocate shadow area for Win64 if (IsWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); @@ -3406,7 +3401,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); OutRegs.push_back(VA.getLocReg()); } else { - assert(VA.isMemLoc()); + assert(VA.isMemLoc() && "Unknown value location!"); // Don't emit stores for undef values. if (isa<UndefValue>(ArgVal)) @@ -3417,7 +3412,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; - unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); + Align Alignment = DL.getABITypeAlign(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); @@ -3537,7 +3532,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, TM.Options.GuaranteedTailCallOpt) ? NumBytes // Callee pops everything. - : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS); + : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB); unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(NumBytesForCalleeToPop); @@ -3549,7 +3544,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. - unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); + Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy); for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); @@ -3582,7 +3577,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { EVT ResVT = VA.getValVT(); unsigned Opc = ResVT == MVT::f32 ? 
X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; - int FI = MFI.CreateStackObject(MemSize, MemSize, false); + int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)), FI) .addReg(CopyReg); @@ -3647,7 +3642,7 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return X86SelectZExt(I); if (DstVT.bitsLT(SrcVT)) return X86SelectTrunc(I); - unsigned Reg = getRegForValue(I->getOperand(0)); + Register Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; updateValueMap(I, Reg); return true; @@ -3668,13 +3663,18 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { DstVT.getVectorElementType() == MVT::i1) return false; - unsigned Reg = getRegForValue(I->getOperand(0)); - if (Reg == 0) + Register Reg = getRegForValue(I->getOperand(0)); + if (!Reg) return false; - // No instruction is needed for conversion. Reuse the register used by - // the fist operand. - updateValueMap(I, Reg); + // Emit a reg-reg copy so we don't propagate cached known bits information + // with the wrong VT if we fall out of fast isel after selecting this. + const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT); + Register ResultReg = createResultReg(DstClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg); + + updateValueMap(I, ResultReg); return true; } } @@ -3688,7 +3688,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { uint64_t Imm = CI->getZExtValue(); if (Imm == 0) { - unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); + Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: @@ -3701,7 +3701,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { case MVT::i32: return SrcReg; case MVT::i64: { - unsigned ResultReg = createResultReg(&X86::GR64RegClass); + Register ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); @@ -3769,11 +3769,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { } // MachineConstantPool wants an explicit alignment. - unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) { - // Alignment of vector types. FIXME! - Align = DL.getTypeAllocSize(CFP->getType()); - } + Align Alignment = DL.getPrefTypeAlign(CFP->getType()); // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; @@ -3786,11 +3782,12 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { PICBase = X86::RIP; // Create the load from the constant pool. - unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); + unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); - if (CM == CodeModel::Large) { - unsigned AddrReg = createResultReg(&X86::GR64RegClass); + // Large code model only applies to 64-bit mode. 
+ if (Subtarget->is64Bit() && CM == CodeModel::Large) { + Register AddrReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), AddrReg) .addConstantPoolIndex(CPI, 0, OpFlag); @@ -3799,7 +3796,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(*FuncInfo.MF), - MachineMemOperand::MOLoad, DL.getPointerSize(), Align); + MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } @@ -3824,7 +3821,7 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); if (TM.getRelocationModel() == Reloc::Static && TLI.getPointerTy(DL) == MVT::i64) { // The displacement code could be more than 32 bits away so we need to use @@ -3883,7 +3880,7 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { ? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r) : X86::LEA64r; const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL)); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); return ResultReg; @@ -3916,7 +3913,7 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { return 0; } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + Register ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); return ResultReg; } @@ -3932,16 +3929,12 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); - unsigned Alignment = LI->getAlignment(); - - if (Alignment == 0) // Ensure that codegen never sees alignment 0 - Alignment = DL.getABITypeAlignment(LI->getType()); SmallVector<MachineOperand, 8> AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl( - *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment, + *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(), /*AllowCommute=*/true); if (!Result) return false; @@ -3958,7 +3951,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg) continue; // Found the index reg, now try to rewrite it. 
- unsigned IndexReg = constrainOperandRegClass(Result->getDesc(), + Register IndexReg = constrainOperandRegClass(Result->getDesc(), MO.getReg(), OperandNo); if (IndexReg == MO.getReg()) continue; @@ -3980,7 +3973,7 @@ unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode, unsigned Op3, bool Op3IsKill) { const MCInstrDesc &II = TII.get(MachineInstOpcode); - unsigned ResultReg = createResultReg(RC); + Register ResultReg = createResultReg(RC); Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp index f8c4a2adb851..78de041329e2 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -350,7 +350,7 @@ MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode, return nullptr; // Don't interfere with formation of CBW instructions which should be a - // shorter encoding than even the MOVSX32rr8. It's also immunte to partial + // shorter encoding than even the MOVSX32rr8. It's also immune to partial // merge issues on Intel CPUs. if (MI->getOpcode() == X86::MOVSX16rr8 && MI->getOperand(0).getReg() == X86::AX && diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp index 9ac401bb0253..424279038921 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -16,8 +16,11 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineSizeOpts.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/Debug.h" @@ -111,6 +114,12 @@ public: MachineFunctionProperties::Property::NoVRegs); } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ProfileSummaryInfoWrapperPass>(); + AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + private: TargetSchedModel TSM; const X86InstrInfo *TII = nullptr; @@ -205,21 +214,27 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) { TSM.init(&ST); TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); + auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + auto *MBFI = (PSI && PSI->hasProfileSummary()) + ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() + : nullptr; LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";); for (MachineBasicBlock &MBB : MF) { // First pass. Try to remove or optimize existing LEAs. + bool OptIncDecPerBB = + OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { if (!isLEA(I->getOpcode())) continue; - if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP)) + if (optTwoAddrLEA(I, MBB, OptIncDecPerBB, UseLEAForSP)) continue; if (IsSlowLEA) processInstructionForSlowLEA(I, MBB); else if (IsSlow3OpsLEA) - processInstrForSlow3OpLEA(I, MBB, OptIncDec); + processInstrForSlow3OpLEA(I, MBB, OptIncDecPerBB); } // Second pass for creating LEAs. 
This may reverse some of the diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp index 924f429fc138..09668d7c5468 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp @@ -36,6 +36,8 @@ STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted"); namespace { class X86FixupSetCCPass : public MachineFunctionPass { public: + static char ID; + X86FixupSetCCPass() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "X86 Fixup SetCC"; } @@ -47,12 +49,12 @@ private: const X86InstrInfo *TII = nullptr; enum { SearchBound = 16 }; - - static char ID; }; +} // end anonymous namespace char X86FixupSetCCPass::ID = 0; -} + +INITIALIZE_PASS(X86FixupSetCCPass, DEBUG_TYPE, DEBUG_TYPE, false, false) FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index b1d2de29c896..831695dabcd8 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -124,10 +124,6 @@ private: MachineInstr &JmpI, CondRegArray &CondRegs); void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse, MachineInstr &CopyDefI); - void rewriteSetCarryExtended(MachineBasicBlock &TestMBB, - MachineBasicBlock::iterator TestPos, - DebugLoc TestLoc, MachineInstr &SetBI, - MachineOperand &FlagUse, CondRegArray &CondRegs); void rewriteSetCC(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, MachineInstr &SetCCI, MachineOperand &FlagUse, @@ -165,6 +161,7 @@ enum class FlagArithMnemonic { RCL, RCR, SBB, + SETB, }; } // namespace @@ -235,6 +232,10 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { case X86::ADOX32rm: case X86::ADOX64rm: return FlagArithMnemonic::ADOX; + + case X86::SETB_C32r: + case X86::SETB_C64r: + return FlagArithMnemonic::SETB; } } @@ -638,24 +639,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { // logic. FlagsKilled = true; - switch (MI.getOpcode()) { - case X86::SETB_C8r: - case X86::SETB_C16r: - case X86::SETB_C32r: - case X86::SETB_C64r: - // Use custom lowering for arithmetic that is merely extending the - // carry flag. We model this as the SETB_C* pseudo instructions. - rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse, - CondRegs); - break; - - default: - // Generically handle remaining uses as arithmetic instructions. - rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse, - CondRegs); - break; - } - break; + // Generically handle remaining uses as arithmetic instructions. + rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse, + CondRegs); } // If this was the last use of the flags, we're done. @@ -821,6 +807,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic( case FlagArithMnemonic::RCL: case FlagArithMnemonic::RCR: case FlagArithMnemonic::SBB: + case FlagArithMnemonic::SETB: Cond = X86::COND_B; // CF == 1 // Set up an addend that when one is added will need a carry due to not // having a higher bit available. 
@@ -959,130 +946,6 @@ void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI, MI.eraseFromParent(); } -void X86FlagsCopyLoweringPass::rewriteSetCarryExtended( - MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, - DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse, - CondRegArray &CondRegs) { - // This routine is only used to handle pseudos for setting a register to zero - // or all ones based on CF. This is essentially the sign extended from 1-bit - // form of SETB and modeled with the SETB_C* pseudos. They require special - // handling as they aren't normal SETcc instructions and are lowered to an - // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that - // they are only provided in reg-defining forms. A complicating factor is that - // they can define many different register widths. - assert(SetBI.getOperand(0).isReg() && - "Cannot have a non-register defined operand to this variant of SETB!"); - - // Little helper to do the common final step of replacing the register def'ed - // by this SETB instruction with a new register and removing the SETB - // instruction. - auto RewriteToReg = [&](unsigned Reg) { - MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg); - SetBI.eraseFromParent(); - }; - - // Grab the register class used for this particular instruction. - auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg()); - - MachineBasicBlock &MBB = *SetBI.getParent(); - auto SetPos = SetBI.getIterator(); - auto SetLoc = SetBI.getDebugLoc(); - - auto AdjustReg = [&](unsigned Reg) { - auto &OrigRC = *MRI->getRegClass(Reg); - if (&OrigRC == &SetBRC) - return Reg; - - unsigned NewReg; - - int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8; - int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8; - assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!"); - assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!"); - int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit, - X86::NoSubRegister, X86::sub_32bit}; - - // If the original size is smaller than the target *and* is smaller than 4 - // bytes, we need to explicitly zero extend it. We always extend to 4-bytes - // to maximize the chance of being able to CSE that operation and to avoid - // partial dependency stalls extending to 2-bytes. - if (OrigRegSize < TargetRegSize && OrigRegSize < 4) { - NewReg = MRI->createVirtualRegister(&X86::GR32RegClass); - BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg) - .addReg(Reg); - if (&SetBRC == &X86::GR32RegClass) - return NewReg; - Reg = NewReg; - OrigRegSize = 4; - } - - NewReg = MRI->createVirtualRegister(&SetBRC); - if (OrigRegSize < TargetRegSize) { - BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG), - NewReg) - .addImm(0) - .addReg(Reg) - .addImm(SubRegIdx[OrigRegSize]); - } else if (OrigRegSize > TargetRegSize) { - if (TargetRegSize == 1 && !Subtarget->is64Bit()) { - // Need to constrain the register class. - MRI->constrainRegClass(Reg, &X86::GR32_ABCDRegClass); - } - - BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), - NewReg) - .addReg(Reg, 0, SubRegIdx[TargetRegSize]); - } else { - BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg) - .addReg(Reg); - } - return NewReg; - }; - - unsigned &CondReg = CondRegs[X86::COND_B]; - if (!CondReg) - CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B); - - // Adjust the condition to have the desired register width by zero-extending - // as needed. 
- // FIXME: We should use a better API to avoid the local reference and using a - // different variable here. - unsigned ExtCondReg = AdjustReg(CondReg); - - // Now we need to turn this into a bitmask. We do this by subtracting it from - // zero. - Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass); - BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg); - ZeroReg = AdjustReg(ZeroReg); - - unsigned Sub; - switch (SetBI.getOpcode()) { - case X86::SETB_C8r: - Sub = X86::SUB8rr; - break; - - case X86::SETB_C16r: - Sub = X86::SUB16rr; - break; - - case X86::SETB_C32r: - Sub = X86::SUB32rr; - break; - - case X86::SETB_C64r: - Sub = X86::SUB64rr; - break; - - default: - llvm_unreachable("Invalid SETB_C* opcode!"); - } - Register ResultReg = MRI->createVirtualRegister(&SetBRC); - BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg) - .addReg(ZeroReg) - .addReg(ExtCondReg); - return RewriteToReg(ResultReg); -} - void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos, DebugLoc TestLoc, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp index 13bbd6ccfce4..e6ee46957500 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -1364,6 +1364,9 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { MBB->remove(&*I++); I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS)); + if (!MI.mayRaiseFPException()) + I->setFlag(MachineInstr::MIFlag::NoFPExcept); + // If both operands are killed, pop one off of the stack in addition to // overwriting the other one. if (KillsOp0 && KillsOp1 && Op0 != Op1) { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp index 1da20371caf5..c7ca6fb2a4fc 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -17,6 +17,7 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,6 +33,12 @@ #include "llvm/Target/TargetOptions.h" #include <cstdlib> +#define DEBUG_TYPE "x86-fl" + +STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue"); +STATISTIC(NumFrameExtraProbe, + "Number of extra stack probes generated in prologue"); + using namespace llvm; X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, @@ -50,7 +57,8 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo().hasVarSizedObjects() && - !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() && + !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall(); } /// canSimplifyCallFramePseudos - If there is a reserved call frame, the @@ -60,6 +68,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || + MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() || (hasFP(MF) && !TRI->needsStackRealignment(MF)) || 
TRI->hasBasePointer(MF); } @@ -83,10 +92,10 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || - TRI->needsStackRealignment(MF) || - MFI.hasVarSizedObjects() || + TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() || MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() || MFI.hasStackMap() || MFI.hasPatchPoint() || MFI.hasCopyImplyingStackAdjustment()); @@ -257,7 +266,20 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, uint64_t Chunk = (1LL << 31) - 1; - if (Offset > Chunk) { + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF); + + // It's ok to not take into account large chunks when probing, as the + // allocation is split in smaller chunks anyway. + if (EmitInlineStackProbe && !InEpilogue) { + + // This pseudo-instruction is going to be expanded, potentially using a + // loop, by inlineStackProbe(). + BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset); + return; + } else if (Offset > Chunk) { // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; @@ -381,8 +403,8 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( } else { bool IsSub = Offset < 0; uint64_t AbsOffset = IsSub ? -Offset : Offset; - unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) - : getADDriOpcode(Uses64BitFramePtr, AbsOffset); + const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset) + : getADDriOpcode(Uses64BitFramePtr, AbsOffset); MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(AbsOffset); @@ -457,9 +479,32 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, .addCFIIndex(CFIIndex); } +/// Emits Dwarf Info specifying offsets of callee saved registers and +/// frame pointer. This is called only when basic block sections are enabled. +void X86FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { + MachineFunction &MF = *MBB.getParent(); + if (!hasFP(MF)) { + emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true); + return; + } + const MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + const unsigned FramePtr = TRI->getFrameRegister(MF); + const unsigned MachineFramePtr = + STI.isTarget64BitILP32() ? unsigned(getX86SubSuperRegister(FramePtr, 64)) + : FramePtr; + unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true); + // Offset = space for return address + size of the frame pointer itself. + unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 
8 : 4); + BuildCFI(MBB, MBBI, DebugLoc{}, + MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset)); + emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true); +} + void X86FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL) const { + const DebugLoc &DL, bool IsPrologue) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -474,10 +519,15 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + + if (IsPrologue) { + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + } else { + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createRestore(nullptr, DwarfReg)); + } } } @@ -488,7 +538,8 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF, const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); if (STI.isTargetWindowsCoreCLR()) { if (InProlog) { - emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)) + .addImm(0 /* no explicit stack size */); } else { emitStackProbeInline(MF, MBB, MBBI, DL, false); } @@ -499,26 +550,13 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF, void X86FrameLowering::inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const { - const StringRef ChkStkStubSymbol = "__chkstk_stub"; - MachineInstr *ChkStkStub = nullptr; - - for (MachineInstr &MI : PrologMBB) { - if (MI.isCall() && MI.getOperand(0).isSymbol() && - ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { - ChkStkStub = &MI; - break; - } - } - - if (ChkStkStub != nullptr) { - assert(!ChkStkStub->isBundled() && - "Not expecting bundled instructions here"); - MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); - assert(std::prev(MBBI) == ChkStkStub && - "MBBI expected after __chkstk_stub."); - DebugLoc DL = PrologMBB.findDebugLoc(MBBI); - emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); - ChkStkStub->eraseFromParent(); + auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) { + return MI.getOpcode() == X86::STACKALLOC_W_PROBING; + }); + if (Where != PrologMBB.end()) { + DebugLoc DL = PrologMBB.findDebugLoc(Where); + emitStackProbeInline(MF, PrologMBB, Where, DL, true); + Where->eraseFromParent(); } } @@ -528,6 +566,167 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF, const DebugLoc &DL, bool InProlog) const { const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR() && STI.is64Bit()) + emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog); + else + emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog); +} + +void X86FrameLowering::emitStackProbeInlineGeneric( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { + MachineInstr &AllocWithProbe = *MBBI; + uint64_t Offset = AllocWithProbe.getOperand(0).getImm(); + + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) && + "different expansion expected for CoreCLR 64 bit"); + + const uint64_t StackProbeSize = 
TLI.getStackProbeSize(MF); + uint64_t ProbeChunk = StackProbeSize * 8; + + // Synthesize a loop or unroll it, depending on the number of iterations. + if (Offset > ProbeChunk) { + emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset); + } else { + emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset); + } +} + +void X86FrameLowering::emitStackProbeInlineGenericBlock( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + uint64_t Offset) const { + + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + uint64_t CurrentOffset = 0; + // 0 Thanks to return address being saved on the stack + uint64_t CurrentProbeOffset = 0; + + // For the first N - 1 pages, just probe. I tried to take advantage of + // natural probes but it implies much more logic and there was very few + // interesting natural probes to interleave. + while (CurrentOffset + StackProbeSize < Offset) { + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. + + + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + NumFrameExtraProbe++; + CurrentOffset += StackProbeSize; + CurrentProbeOffset += StackProbeSize; + } + + uint64_t ChunkSize = Offset - CurrentOffset; + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(ChunkSize) + .setMIFlag(MachineInstr::FrameSetup); + MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. +} + +void X86FrameLowering::emitStackProbeInlineGenericLoop( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + uint64_t Offset) const { + assert(Offset && "null offset"); + + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const X86TargetLowering &TLI = *STI.getTargetLowering(); + const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi; + const uint64_t StackProbeSize = TLI.getStackProbeSize(MF); + + // Synthesize a loop + NumFrameLoopProbe++; + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB.getIterator(); + MF.insert(MBBIter, testMBB); + MF.insert(MBBIter, tailMBB); + + Register FinalStackProbed = Uses64BitFramePtr ? 
X86::R11 : X86::R11D; + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed) + .addReg(StackPtr) + .setMIFlag(MachineInstr::FrameSetup); + + // save loop bound + { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset); + BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed) + .addReg(FinalStackProbed) + .addImm(Offset / StackProbeSize * StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + // allocate a page + { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize); + BuildMI(testMBB, DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(StackProbeSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + // touch the page + addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc)) + .setMIFlag(MachineInstr::FrameSetup), + StackPtr, false, 0) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + // cmp with stack pointer bound + BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) + .addReg(StackPtr) + .addReg(FinalStackProbed) + .setMIFlag(MachineInstr::FrameSetup); + + // jump + BuildMI(testMBB, DL, TII.get(X86::JCC_1)) + .addMBB(testMBB) + .addImm(X86::COND_NE) + .setMIFlag(MachineInstr::FrameSetup); + testMBB->addSuccessor(testMBB); + testMBB->addSuccessor(tailMBB); + + // BB management + tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end()); + tailMBB->transferSuccessorsAndUpdatePHIs(&MBB); + MBB.addSuccessor(testMBB); + + // handle tail + unsigned TailOffset = Offset % StackProbeSize; + if (TailOffset) { + const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset); + BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr) + .addReg(StackPtr) + .addImm(TailOffset) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Update Live In information + recomputeLiveIns(*testMBB); + recomputeLiveIns(*tailMBB); +} + +void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); assert(STI.is64Bit() && "different expansion needed for 32 bit"); assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); const TargetInstrInfo &TII = *STI.getInstrInfo(); @@ -821,16 +1020,6 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, } } -void X86FrameLowering::emitStackProbeInlineStub( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const { - - assert(InProlog && "ChkStkStub called outside prolog!"); - - BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) - .addExternalSymbol("__chkstk_stub"); -} - static unsigned calculateSetFPREG(uint64_t SPAdjust) { // Win64 ABI has a less restrictive limitation of 240; 128 works equally well // and might require smaller successive adjustments. @@ -846,15 +1035,15 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) { // go with the minimum SlotSize. uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment. - unsigned StackAlign = getStackAlignment(); + Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment. + Align StackAlign = getStackAlign(); if (MF.getFunction().hasFnAttribute("stackrealign")) { if (MFI.hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign; else if (MaxAlign < SlotSize) - MaxAlign = SlotSize; + MaxAlign = Align(SlotSize); } - return MaxAlign; + return MaxAlign.value(); } void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, @@ -1014,7 +1203,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); + const bool EmitStackProbeCall = + STI.getTargetLowering()->hasStackProbeSymbol(MF); unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF); // Re-align the stack on 64-bit if the x86-interrupt calling convention is @@ -1032,11 +1222,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (has128ByteRedZone(MF) && - !TRI->needsStackRealignment(MF) && + if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. - !UseStackProbe && // No stack probes. + !EmitStackProbeCall && // No stack probes. !MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); @@ -1115,7 +1304,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth)); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); @@ -1192,7 +1381,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided offset. assert(StackSize); BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset)); StackOffset += stackGrowth; } @@ -1237,7 +1426,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, uint64_t AlignedNumBytes = NumBytes; if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); - if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { + if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) { assert(!X86FI->getUsesRedZone() && "The Red Zone is not accounted for in stack probes"); @@ -1323,17 +1512,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), Establisher, false, PSPSlotOffset) .addMemOperand(MF.getMachineMemOperand( - NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize))); ; // Save the root establisher back into the current funclet's (mostly // empty) frame, in case a sub-funclet or the GC needs it. 
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, PSPSlotOffset) .addReg(Establisher) - .addMemOperand( - MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile, - SlotSize, SlotSize)); + .addMemOperand(MF.getMachineMemOperand( + NoInfo, + MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, Align(SlotSize))); } SPOrEstablisher = Establisher; } else { @@ -1370,7 +1559,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // into the registration node so that the runtime will restore it for us. if (!MBB.isCleanupFuncletEntry()) { assert(Personality == EHPersonality::MSVC_CXX); - unsigned FrameReg; + Register FrameReg; int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); // ESP is the first field, so no extra displacement is needed. @@ -1389,7 +1578,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { int Offset; - unsigned IgnoredFrameReg; + Register IgnoredFrameReg; if (IsWin64Prologue && IsFunclet) Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg); else @@ -1423,7 +1612,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .addReg(StackPtr) .addMemOperand(MF.getMachineMemOperand( PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, - SlotSize, SlotSize)); + SlotSize, Align(SlotSize))); } // Realign stack after we spilled callee-saved registers (so that we'll be @@ -1464,7 +1653,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // it recovers the frame pointer from the base pointer rather than the // other way around. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - unsigned UsedReg; + Register UsedReg; int Offset = getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); assert(UsedReg == BasePtr); @@ -1479,12 +1668,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. assert(StackSize); - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( - nullptr, -StackSize + stackGrowth)); + BuildCFI( + MBB, MBBI, DL, + MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth)); } // Emit DWARF info specifying the offsets of the callee-saved registers. - emitCalleeSavedFrameMoves(MBB, MBBI, DL); + emitCalleeSavedFrameMoves(MBB, MBBI, DL, true); } // X86 Interrupt handling function cannot assume anything about the direction @@ -1541,7 +1731,7 @@ static bool isFuncletReturnInstr(MachineInstr &MI) { unsigned X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); - unsigned SPReg; + Register SPReg; int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg, /*IgnoreSPUpdates*/ true); assert(Offset >= 0 && SPReg == TRI->getStackRegister()); @@ -1573,7 +1763,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { // RBP is not included in the callee saved register block. After pushing RBP, // everything is 16 byte aligned. Everything we allocate before an outgoing // call must also be 16 byte aligned. - unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment()); + unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign()); // Subtract out the size of the callee saved registers. 
This is how much stack // each funclet will allocate. return FrameSizeMinusRBP + XMMSize - CSSize; @@ -1634,6 +1824,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } uint64_t SEHStackAllocAmt = NumBytes; + // AfterPop is the position to insert .cfi_restore. + MachineBasicBlock::iterator AfterPop = MBBI; if (HasFP) { // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), @@ -1642,8 +1834,15 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (NeedsDwarfCFI) { unsigned DwarfStackPtr = TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true); - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa( - nullptr, DwarfStackPtr, -SlotSize)); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize)); + if (!MBB.succ_empty() && !MBB.isReturnBlock()) { + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, AfterPop, DL, + MCCFIInstruction::createRestore(nullptr, DwarfFramePtr)); + --MBBI; + --AfterPop; + } --MBBI; } } @@ -1711,8 +1910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true); if (!hasFP(MF) && NeedsDwarfCFI) { // Define the current CFA rule to use the provided offset. - BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset( - nullptr, -CSSize - SlotSize)); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::cfiDefCfaOffset(nullptr, CSSize + SlotSize)); } --MBBI; } @@ -1738,11 +1937,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, if (Opc == X86::POP32r || Opc == X86::POP64r) { Offset += SlotSize; BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaOffset(nullptr, Offset)); + MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset)); } } } + // Emit DWARF info specifying the restores of the callee-saved registers. + // For epilogue with return inside or being other block without successor, + // no need to generate .cfi_restore for callee-saved registers. + if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) { + emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false); + } + if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. int Offset = -1 * X86FI->getTCReturnAddrDelta(); @@ -1756,7 +1962,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { + Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsFixed = MFI.isFixedObjectIndex(FI); @@ -1821,7 +2027,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Skip the saved EBP. return Offset + SlotSize + FPDelta; } else { - assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0); + assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); return Offset + StackSize; } } else if (TRI->needsStackRealignment(MF)) { @@ -1829,7 +2035,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, // Skip the saved EBP. 
return Offset + SlotSize + FPDelta; } else { - assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0); + assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize))); return Offset + StackSize; } // FIXME: Support tail calls @@ -1849,8 +2055,8 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, return Offset + FPDelta; } -int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, - int FI, unsigned &FrameReg) const { +int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, + Register &FrameReg) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo(); @@ -1860,21 +2066,21 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, return getFrameIndexReference(MF, FI, FrameReg); FrameReg = TRI->getStackRegister(); - return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; + return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) + + it->second; } int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, - int FI, unsigned &FrameReg, + int FI, Register &FrameReg, int Adjustment) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); FrameReg = TRI->getStackRegister(); return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment; } -int -X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool IgnoreSPUpdates) const { +int X86FrameLowering::getFrameIndexReferencePreferSP( + const MachineFunction &MF, int FI, Register &FrameReg, + bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // Does not include any dynamic realign. @@ -1985,7 +2191,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( if (this->TRI->hasBasePointer(MF)) { // Allocate a spill slot for EBP if we have a base pointer and EH funclets. if (MF.hasEHFunclets()) { - int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize); + int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize)); X86FI->setHasSEHFramePtrSave(true); X86FI->setSEHFramePtrSaveIndex(FI); } @@ -2038,16 +2244,16 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); unsigned Size = TRI->getSpillSize(*RC); - unsigned Align = TRI->getSpillAlignment(*RC); + Align Alignment = TRI->getSpillAlign(*RC); // ensure alignment assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86"); - SpillSlotOffset = -alignTo(-SpillSlotOffset, Align); + SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment); // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); - MFI.ensureMaxAlignment(Align); + MFI.ensureMaxAlignment(Alignment); // Save the start offset and size of XMM in stack frame for funclets. if (X86::VR128RegClass.contains(Reg)) { @@ -2061,8 +2267,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( bool X86FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); // Don't save CSRs in 32-bit EH funclets. 
The caller saves EBX, EBP, ESI, EDI @@ -2161,10 +2366,9 @@ void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB, CatchRetTarget->setHasAddressTaken(); } -bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { +bool X86FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -2799,6 +3003,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); + // Try to avoid emitting dead SP adjustments if the block end is unreachable, + // typically because the function is marked noreturn (abort, throw, + // assert_fail, etc). + if (isDestroy && blockEndIsUnreachable(MBB, I)) + return I; + if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the @@ -2807,8 +3017,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned StackAlign = getStackAlignment(); - Amount = alignTo(Amount, StackAlign); + Amount = alignTo(Amount, getStackAlign()); const Function &F = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -2881,13 +3090,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return I; } - if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) { - // If we are performing frame pointer elimination and if the callee pops - // something off the stack pointer, add it back. We do this until we have - // more advanced stack pointer tracking ability. - // We are not tracking the stack pointer adjustment by the callee, so make - // sure we restore the stack pointer immediately after the call, there may - // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + if (InternalAmt) { MachineBasicBlock::iterator CI = I; MachineBasicBlock::iterator B = MBB.begin(); while (CI != B && !std::prev(CI)->isCall()) @@ -2964,7 +3167,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( .setMIFlag(MachineInstr::FrameSetup); } - unsigned UsedReg; + Register UsedReg; int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); int EndOffset = -EHRegOffset - EHRegSize; FuncInfo.EHRegNodeEndOffset = EndOffset; @@ -3003,8 +3206,8 @@ int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const { return TRI->getSlotSize(); } -unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) - const { +Register +X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const { return TRI->getDwarfRegNum(StackPtr, true); } @@ -3014,7 +3217,7 @@ struct X86FrameSortingObject { bool IsValid = false; // true if we care about this Object. unsigned ObjectIndex = 0; // Index of Object into MFI list. unsigned ObjectSize = 0; // Size of Object in bytes. - unsigned ObjectAlignment = 1; // Alignment of Object in bytes. + Align ObjectAlignment = Align(1); // Alignment of Object in bytes. unsigned ObjectNumUses = 0; // Object static number of uses. 
}; @@ -3099,7 +3302,7 @@ void X86FrameLowering::orderFrameObjects( for (auto &Obj : ObjectsToAllocate) { SortingObjects[Obj].IsValid = true; SortingObjects[Obj].ObjectIndex = Obj; - SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj); + SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj); // Set the size. int ObjectSize = MFI.getObjectSize(Obj); if (ObjectSize == 0) @@ -3192,7 +3395,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( int FrameIndex = H.CatchObj.FrameIndex; if (FrameIndex != INT_MAX) { // Ensure alignment. - unsigned Align = MFI.getObjectAlignment(FrameIndex); + unsigned Align = MFI.getObjectAlign(FrameIndex).value(); MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align; MinFixedObjOffset -= MFI.getObjectSize(FrameIndex); MFI.setObjectOffset(FrameIndex, MinFixedObjOffset); @@ -3219,3 +3422,24 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( UnwindHelpFI) .addImm(-2); } + +void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced( + MachineFunction &MF, RegScavenger *RS) const { + if (STI.is32Bit() && MF.hasEHFunclets()) + restoreWinEHStackPointersInParent(MF); +} + +void X86FrameLowering::restoreWinEHStackPointersInParent( + MachineFunction &MF) const { + // 32-bit functions have to restore stack pointers when control is transferred + // back to the parent function. These blocks are identified as eh pads that + // are not funclet entries. + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction().getPersonalityFn())); + for (MachineBasicBlock &MBB : MF) { + bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry(); + if (NeedsRestore) + restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(), + /*RestoreSP=*/IsSEH); + } +} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h index 2103d6471ead..c0b4be95f88d 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h @@ -58,9 +58,14 @@ public: void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; + void + emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) const override; + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL) const; + const DebugLoc &DL, + bool IsPrologue) const override; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
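// A minimal standalone sketch of the catch-object placement arithmetic in
// processFunctionBeforeFrameFinalized above: MinFixedObjOffset is first
// rounded down to the catch object's alignment, then moved past the object's
// size, and the result becomes the object's frame offset. The helper name and
// the concrete numbers are illustrative only, not part of the LLVM sources.
#include <cassert>
#include <cstdint>
#include <cstdlib>

int64_t placeCatchObject(int64_t MinFixedObjOffset, int64_t Alignment,
                         int64_t ObjectSize) {
  // Round the (negative) running offset down to the requested alignment.
  MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Alignment;
  // Reserve space for the object itself; the new minimum is its offset.
  MinFixedObjOffset -= ObjectSize;
  return MinFixedObjOffset;
}

int main() {
  // Starting at -36 with an 8-byte-aligned, 16-byte catch object:
  // -36 -> -40 (now 8-byte aligned) -> -56 (final object offset).
  assert(placeCatchObject(-36, 8, 16) == -56);
  return 0;
}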
@@ -83,13 +88,14 @@ public: bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const override; - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const override; + bool + restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -97,14 +103,14 @@ public: bool needsFrameIndexResolution(const MachineFunction &MF) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; + Register &FrameReg) const override; - int getWin64EHFrameIndexRef(const MachineFunction &MF, - int FI, unsigned &SPReg) const; - int getFrameIndexReferenceSP(const MachineFunction &MF, - int FI, unsigned &SPReg, int Adjustment) const; + int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI, + Register &SPReg) const; + int getFrameIndexReferenceSP(const MachineFunction &MF, int FI, + Register &SPReg, int Adjustment) const; int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI, - unsigned &FrameReg, + Register &FrameReg, bool IgnoreSPUpdates) const override; MachineBasicBlock::iterator @@ -116,6 +122,10 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + void + processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, + RegScavenger *RS) const override; + /// Check the instruction before/after the passed instruction. If /// it is an ADD/SUB/LEA instruction it is deleted argument and the /// stack adjustment is returned as a positive value for ADD/LEA and @@ -169,12 +179,14 @@ public: MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool RestoreSP = false) const; + void restoreWinEHStackPointersInParent(MachineFunction &MF) const; + int getInitialCFAOffset(const MachineFunction &MF) const override; - unsigned getInitialCFARegister(const MachineFunction &MF) const override; + Register getInitialCFARegister(const MachineFunction &MF) const override; /// Return true if the function has a redzone (accessible bytes past the - /// frame of the top of stack function) as part of it's ABI. + /// frame of the top of stack function) as part of it's ABI. 
bool has128ByteRedZone(const MachineFunction& MF) const; private: @@ -189,11 +201,33 @@ private: void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const; + void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const; + void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, bool InProlog) const; + + void emitStackProbeInlineGenericBlock(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + uint64_t Offset) const; + + void emitStackProbeInlineGenericLoop(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + uint64_t Offset) const; /// Emit a stub to later inline the target stack probe. - void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool InProlog) const; + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, + bool InProlog) const; /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 88af0ebcfd0e..3cd80cb04ab8 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -17,8 +17,6 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/ConstantRange.h" @@ -31,9 +29,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" #include <stdint.h> using namespace llvm; @@ -45,6 +40,10 @@ static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true), cl::desc("Enable setting constant bits to reduce size of mask immediates"), cl::Hidden); +static cl::opt<bool> EnablePromoteAnyextLoad( + "x86-promote-anyext-load", cl::init(true), + cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); + //===----------------------------------------------------------------------===// // Pattern Matcher Implementation //===----------------------------------------------------------------------===// @@ -72,14 +71,14 @@ namespace { const char *ES; MCSymbol *MCSym; int JT; - unsigned Align; // CP alignment. + Align Alignment; // CP alignment. 
+ Align Alignment; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_* bool NegateIndex = false; X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), - MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {} + MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {} bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || @@ -145,7 +144,7 @@ namespace { dbgs() << MCSym; else dbgs() << "nul"; - dbgs() << " JT" << JT << " Align" << Align << '\n'; + dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; } #endif }; @@ -161,10 +160,6 @@ namespace { /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// If true, selector should try to optimize for code size instead of - /// performance. - bool OptForSize; - /// If true, selector should try to optimize for minimum code size. bool OptForMinSize; @@ -173,7 +168,7 @@ namespace { public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false), + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForMinSize(false), IndirectTlsSegRefs(false) {} StringRef getPassName() const override { @@ -187,16 +182,15 @@ namespace { "indirect-tls-seg-refs"); // OptFor[Min]Size are used in pattern predicates that isel is matching. - OptForSize = MF.getFunction().hasOptSize(); OptForMinSize = MF.getFunction().hasMinSize(); - assert((!OptForMinSize || OptForSize) && + assert((!OptForMinSize || MF.getFunction().hasOptSize()) && "OptForMinSize implies OptForSize"); SelectionDAGISel::runOnMachineFunction(MF); return true; } - void EmitFunctionEntryCode() override; + void emitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; @@ -221,9 +215,9 @@ namespace { bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, SDValue &Disp, - SDValue &Segment); + bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, + SDValue ScaleOp, SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -234,11 +228,6 @@ namespace { bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N, - SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp, - SDValue &Segment, - SDValue &NodeWithChain); bool selectRelocImm(SDValue N, SDValue &Op); bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, @@ -259,6 +248,8 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment); + bool isProfitableToFormMaskedOp(SDNode *N) const; + /// Implement addressing mode selection for inline asm expressions. 
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -300,8 +291,8 @@ namespace { MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) - Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, - AM.Align, AM.Disp, AM.SymbolFlags); + Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, + AM.Disp, AM.SymbolFlags); else if (AM.ES) { assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); @@ -368,9 +359,10 @@ namespace { if (User->getNumOperands() != 2) continue; - // If this can match to INC/DEC, don't count it as a use. - if (User->getOpcode() == ISD::ADD && - (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0)))) + // If this is a sign-extended 8-bit integer immediate used in an ALU + // instruction, there is probably an opcode encoding to save space. + auto *C = dyn_cast<ConstantSDNode>(N); + if (C && isInt<8>(C->getSExtValue())) continue; // Immediates that are used for offsets as part of stack @@ -475,14 +467,6 @@ namespace { bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; - /// Returns whether this is a relocatable immediate in the range - /// [-2^Width .. 2^Width-1]. - template <unsigned Width> bool isSExtRelocImm(SDNode *N) const { - if (auto *CN = dyn_cast<ConstantSDNode>(N)) - return isInt<Width>(CN->getSExtValue()); - return isSExtAbsoluteSymbolRef(Width, N); - } - // Indicates we should prefer to use a non-temporal load for this load. bool useNonTemporalLoad(LoadSDNode *N) const { if (!N->isNonTemporal()) @@ -513,8 +497,8 @@ namespace { bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); - bool combineIncDecVector(SDNode *Node); bool tryShrinkShlLogicImm(SDNode *N); + bool tryVPTERNLOG(SDNode *N); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); bool tryMatchBitSelect(SDNode *N); @@ -581,12 +565,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (!N.hasOneUse()) return false; - // FIXME: Temporary hack to prevent strict floating point nodes from - // folding into masked operations illegally. - if (U == Root && Root->getOpcode() == ISD::VSELECT && - N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD) - return false; - if (N.getOpcode() != ISD::LOAD) return true; @@ -650,6 +628,11 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && (-Imm->getAPIntValue()).isSignedIntN(8)) return false; + + if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && + (-Imm->getAPIntValue()).isSignedIntN(8) && + hasNoCarryFlagUses(SDValue(U, 1))) + return false; } // If the other operand is a TLS address, we should fold it instead. @@ -724,6 +707,20 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { return true; } +// Indicates it is profitable to form an AVX512 masked operation. Returning +// false will favor a masked register-register masked move or vblendm and the +// operation will be selected separately. +bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { + assert( + (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && + "Unexpected opcode!"); + + // If the operation has additional users, the operation will be duplicated. + // Check the use count to prevent that. + // FIXME: Are there cheap opcodes we might want to duplicate? 
+ return N->getOperand(1).hasOneUse(); +} + /// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, @@ -799,6 +796,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { + bool MadeChange = false; for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. @@ -811,11 +809,111 @@ void X86DAGToDAGISel::PreprocessISelDAG() { --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; - CurDAG->DeleteNode(N); + MadeChange = true; continue; } + /// Convert vector increment or decrement to sub/add with an all-ones + /// constant: + /// add X, <1, 1...> --> sub X, <-1, -1...> + /// sub X, <1, 1...> --> add X, <-1, -1...> + /// The all-ones vector constant can be materialized using a pcmpeq + /// instruction that is commonly recognized as an idiom (has no register + /// dependency), so that's better/smaller than loading a splat 1 constant. + if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && + N->getSimpleValueType(0).isVector()) { + + APInt SplatVal; + if (X86::isConstantSplat(N->getOperand(1), SplatVal) && + SplatVal.isOneValue()) { + SDLoc DL(N); + + MVT VT = N->getSimpleValueType(0); + unsigned NumElts = VT.getSizeInBits() / 32; + SDValue AllOnes = + CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts)); + AllOnes = CurDAG->getBitcast(VT, AllOnes); + + unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; + SDValue Res = + CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes); + --I; + CurDAG->ReplaceAllUsesWith(N, Res.getNode()); + ++I; + MadeChange = true; + continue; + } + } + switch (N->getOpcode()) { + case X86ISD::VBROADCAST: { + MVT VT = N->getSimpleValueType(0); + // Emulate v32i16/v64i8 broadcast without BWI. + if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { + MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; + SDLoc dl(N); + SDValue NarrowBCast = + CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); + SDValue Res = + CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), + NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); + unsigned Index = VT == MVT::v32i16 ? 16 : 32; + Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, + CurDAG->getIntPtrConstant(Index, dl)); + + --I; + CurDAG->ReplaceAllUsesWith(N, Res.getNode()); + ++I; + MadeChange = true; + continue; + } + + break; + } + case X86ISD::VBROADCAST_LOAD: { + MVT VT = N->getSimpleValueType(0); + // Emulate v32i16/v64i8 broadcast without BWI. + if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { + MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; + auto *MemNode = cast<MemSDNode>(N); + SDLoc dl(N); + SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); + SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; + SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(), + MemNode->getMemOperand()); + SDValue Res = + CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), + NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); + unsigned Index = VT == MVT::v32i16 ? 
16 : 32; + Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, + CurDAG->getIntPtrConstant(Index, dl)); + + --I; + SDValue To[] = {Res, NarrowBCast.getValue(1)}; + CurDAG->ReplaceAllUsesWith(N, To); + ++I; + MadeChange = true; + continue; + } + + break; + } + case ISD::VSELECT: { + // Replace VSELECT with non-mask conditions with with BLENDV. + if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1) + break; + + assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); + SDValue Blendv = + CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), + N->getOperand(0), N->getOperand(1), N->getOperand(2)); + --I; + CurDAG->ReplaceAllUsesWith(N, Blendv.getNode()); + ++I; + MadeChange = true; + continue; + } case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: case ISD::FP_TO_SINT: @@ -849,7 +947,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; - CurDAG->DeleteNode(N); + MadeChange = true; continue; } case ISD::SHL: @@ -872,27 +970,33 @@ void X86DAGToDAGISel::PreprocessISelDAG() { --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; - CurDAG->DeleteNode(N); + MadeChange = true; continue; } case ISD::ANY_EXTEND: case ISD::ANY_EXTEND_VECTOR_INREG: { // Replace vector any extend with the zero extend equivalents so we don't // need 2 sets of patterns. Ignore vXi1 extensions. - if (!N->getValueType(0).isVector() || - N->getOperand(0).getScalarValueSizeInBits() == 1) + if (!N->getValueType(0).isVector()) break; - unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND - ? ISD::ZERO_EXTEND - : ISD::ZERO_EXTEND_VECTOR_INREG; + unsigned NewOpc; + if (N->getOperand(0).getScalarValueSizeInBits() == 1) { + assert(N->getOpcode() == ISD::ANY_EXTEND && + "Unexpected opcode for mask vector!"); + NewOpc = ISD::SIGN_EXTEND; + } else { + NewOpc = N->getOpcode() == ISD::ANY_EXTEND + ? ISD::ZERO_EXTEND + : ISD::ZERO_EXTEND_VECTOR_INREG; + } SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; - CurDAG->DeleteNode(N); + MadeChange = true; continue; } case ISD::FCEIL: @@ -936,7 +1040,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; - CurDAG->DeleteNode(N); + MadeChange = true; continue; } case X86ISD::FANDN: @@ -979,7 +1083,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; - CurDAG->DeleteNode(N); + MadeChange = true; continue; } } @@ -1018,6 +1122,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { continue; moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; + MadeChange = true; continue; } @@ -1064,14 +1169,17 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // operations. Based on this, decide what we want to do. MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? 
- SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0), - MemTmp, MachinePointerInfo(), MemVT); - SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, - MachinePointerInfo(), MemVT); + SDValue Store = CurDAG->getTruncStore( + CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT); + SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, + MemTmp, MPI, MemVT); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because @@ -1117,6 +1225,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // operations. Based on this, decide what we want to do. MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); + int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? @@ -1127,7 +1238,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDVTList VTs = CurDAG->getVTList(MVT::Other); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, - MachinePointerInfo(), 0, + MPI, /*Align*/ None, MachineMemOperand::MOStore); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Store->getFlags(); @@ -1137,15 +1248,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } else { assert(SrcVT == MemVT && "Unexpected VT!"); Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, - MachinePointerInfo()); + MPI); } if (!DstIsSSE) { SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); SDValue Ops[] = {Store, MemTmp}; - Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT, - MachinePointerInfo(), 0, - MachineMemOperand::MOLoad); + Result = CurDAG->getMemIntrinsicNode( + X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, + /*Align*/ None, MachineMemOperand::MOLoad); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Result->getFlags(); Flags.setNoFPExcept(true); @@ -1153,8 +1264,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } } else { assert(DstVT == MemVT && "Unexpected VT!"); - Result = - CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo()); + Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI); } // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the @@ -1171,13 +1281,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() { // Now that we did that, the node is dead. Increment the iterator to the // next node to process, then delete N. ++I; - CurDAG->DeleteNode(N); + MadeChange = true; } - // The load+call transform above can leave some dead nodes in the graph. Make - // sure we remove them. Its possible some of the other transforms do to so - // just remove dead nodes unconditionally. - CurDAG->RemoveDeadNodes(); + // Remove any dead nodes that may have been left behind. + if (MadeChange) + CurDAG->RemoveDeadNodes(); } // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. 
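// PreprocessISelDAG above now canonicalizes "add X, <1,1,...>" into
// "sub X, <-1,-1,...>" (and the symmetric case for sub), since an all-ones
// vector can be materialized with a pcmpeq idiom instead of loading a splat-1
// constant. A minimal standalone check of the wrap-around identity that makes
// the rewrite value-preserving on 32-bit lanes; the test values are arbitrary.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t AllOnes = 0xFFFFFFFFu; // per-lane result of pcmpeqd x, x
  const uint32_t Vals[] = {0u, 1u, 7u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t X : Vals) {
    assert(X + 1u == X - AllOnes); // add X, 1  ==  sub X, -1 (mod 2^32)
    assert(X - 1u == X + AllOnes); // sub X, 1  ==  add X, -1 (mod 2^32)
  }
  return 0;
}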
@@ -1275,6 +1384,8 @@ void X86DAGToDAGISel::PostprocessISelDAG() { And.getOperand(6) /* Chain */ }; MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops); + CurDAG->setNodeMemRefs( + Test, cast<MachineSDNode>(And.getNode())->memoperands()); ReplaceUses(N, Test); MadeChange = true; continue; @@ -1390,7 +1501,7 @@ void X86DAGToDAGISel::emitSpecialCodeForMain() { } } -void X86DAGToDAGISel::EmitFunctionEntryCode() { +void X86DAGToDAGISel::emitFunctionEntryCode() { // If this is main, emit special code for main. const Function &F = MF->getFunction(); if (F.hasExternalLinkage() && F.getName() == "main") @@ -1409,18 +1520,20 @@ static bool isDispSafeForFrameIndex(int64_t Val) { bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { - // If there's no offset to fold, we don't need to do any work. - if (Offset == 0) - return false; + // We may have already matched a displacement and the caller just added the + // symbolic displacement. So we still need to do the checks even if Offset + // is zero. + + int64_t Val = AM.Disp + Offset; // Cannot combine ExternalSymbol displacements with integer offsets. - if (AM.ES || AM.MCSym) + if (Val != 0 && (AM.ES || AM.MCSym)) return true; - int64_t Val = AM.Disp + Offset; CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit()) { - if (!X86::isOffsetSuitableForCodeModel(Val, M, + if (Val != 0 && + !X86::isOffsetSuitableForCodeModel(Val, M, AM.hasSymbolicDisplacement())) return true; // In addition to the checks required for a register base, check that @@ -1449,13 +1562,13 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) switch (N->getPointerInfo().getAddrSpace()) { - case 256: + case X86AS::GS: AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); return false; - case 257: + case X86AS::FS: AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); return false; - // Address space 258 is not handled here, because it is not used to + // Address space X86AS::SS is not handled here, because it is not used to // address TLS areas. } @@ -1505,7 +1618,7 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { Offset = G->getOffset(); } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) { AM.CP = CP->getConstVal(); - AM.Align = CP->getAlignment(); + AM.Alignment = CP->getAlign(); AM.SymbolFlags = CP->getTargetFlags(); Offset = CP->getOffset(); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) { @@ -1583,9 +1696,10 @@ bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, return false; AM = Backup; - // Try again after commuting the operands. - if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && - !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + // Try again after commutating the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, + Depth + 1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1)) return false; AM = Backup; @@ -1782,7 +1896,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. 
- if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; @@ -1877,7 +1991,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. - if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; + if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); @@ -2280,15 +2394,16 @@ bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { return matchAddressBase(N, AM); } -bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { +bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, + SDValue IndexOp, SDValue ScaleOp, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { X86ISelAddressMode AM; - auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent); - AM.IndexReg = Mgs->getIndex(); - AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue(); + AM.IndexReg = IndexOp; + AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue(); - unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); + unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == X86AS::FS) @@ -2296,11 +2411,11 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); - SDLoc DL(N); - MVT VT = N.getSimpleValueType(); + SDLoc DL(BasePtr); + MVT VT = BasePtr.getSimpleValueType(); // Try to match into the base and displacement fields. - if (matchVectorAddress(N, AM)) + if (matchVectorAddress(BasePtr, AM)) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); @@ -2331,12 +2446,11 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); - // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. - if (AddrSpace == 256) + if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); - if (AddrSpace == 257) + if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); - if (AddrSpace == 258) + if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); } @@ -2351,86 +2465,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -// We can only fold a load if all nodes between it and the root node have a -// single use. If there are additional uses, we could end up duplicating the -// load. -static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) { - while (User != Root) { - if (!User->hasOneUse()) - return false; - User = *User->use_begin(); - } - - return true; -} - -/// Match a scalar SSE load. In particular, we want to match a load whose top -/// elements are either undef or zeros. The load flavor is derived from the -/// type of N, which is either v4f32 or v2f64. -/// -/// We also return: -/// PatternChainNode: this is the matched node that has a chain input and -/// output. 
-bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, - SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment, - SDValue &PatternNodeWithChain) { - if (!hasSingleUsesFromRoot(Root, Parent)) - return false; - - // We can allow a full vector load here since narrowing a load is ok unless - // it's volatile or atomic. - if (ISD::isNON_EXTLoad(N.getNode())) { - LoadSDNode *LD = cast<LoadSDNode>(N); - if (LD->isSimple() && - IsProfitableToFold(N, LD, Root) && - IsLegalToFold(N, Parent, Root, OptLevel)) { - PatternNodeWithChain = N; - return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, - Segment); - } - } - - // We can also match the special zero extended load opcode. - if (N.getOpcode() == X86ISD::VZEXT_LOAD) { - PatternNodeWithChain = N; - if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { - auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain); - return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, - Segment); - } - } - - // Need to make sure that the SCALAR_TO_VECTOR and load are both only used - // once. Otherwise the load might get duplicated and the chain output of the - // duplicate load will not be observed by all dependencies. - if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) { - PatternNodeWithChain = N.getOperand(0); - if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && - IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { - LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, - Segment); - } - } - - return false; -} - - bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { - if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { - uint64_t ImmVal = CN->getZExtValue(); - if (!isUInt<32>(ImmVal)) - return false; - - Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); - return true; - } - // In static codegen with small code model, we can get the address of a label // into a register with 'movl' if (N->getOpcode() != X86ISD::Wrapper) @@ -2604,12 +2639,6 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, } bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { - if (auto *CN = dyn_cast<ConstantSDNode>(N)) { - Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN), - N.getValueType()); - return true; - } - // Keep track of the original value type and whether this value was // truncated. If we see a truncation from pointer type to VT that truncates // bits that are known to be zero, we can use a narrow reference. @@ -3896,49 +3925,82 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) { return true; } -/// Convert vector increment or decrement to sub/add with an all-ones constant: -/// add X, <1, 1...> --> sub X, <-1, -1...> -/// sub X, <1, 1...> --> add X, <-1, -1...> -/// The all-ones vector constant can be materialized using a pcmpeq instruction -/// that is commonly recognized as an idiom (has no register dependency), so -/// that's better/smaller than loading a splat 1 constant. 
-bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) { - assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) && - "Unexpected opcode for increment/decrement transform"); - - EVT VT = Node->getValueType(0); - assert(VT.isVector() && "Should only be called for vectors."); - - SDValue X = Node->getOperand(0); - SDValue OneVec = Node->getOperand(1); +// Try to match two logic ops to a VPTERNLOG. +// FIXME: Handle inverted inputs? +// FIXME: Handle more complex patterns that use an operand more than once? +bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { + MVT NVT = N->getSimpleValueType(0); - APInt SplatVal; - if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue()) + // Make sure we support VPTERNLOG. + if (!NVT.isVector() || !Subtarget->hasAVX512() || + NVT.getVectorElementType() == MVT::i1) return false; - SDLoc DL(Node); - SDValue OneConstant, AllOnesVec; + // We need VLX for 128/256-bit. + if (!(Subtarget->hasVLX() || NVT.is512BitVector())) + return false; - APInt Ones = APInt::getAllOnesValue(32); - assert(VT.getSizeInBits() % 32 == 0 && - "Expected bit count to be a multiple of 32"); - OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32); - insertDAGNode(*CurDAG, X, OneConstant); + unsigned Opc1 = N->getOpcode(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); - unsigned NumElts = VT.getSizeInBits() / 32; - assert(NumElts > 0 && "Expected to get non-empty vector."); - AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts), - DL, OneConstant); - insertDAGNode(*CurDAG, X, AllOnesVec); + auto isLogicOp = [](unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || + Opc == X86ISD::ANDNP; + }; - AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec); - insertDAGNode(*CurDAG, X, AllOnesVec); + SDValue A, B, C; + unsigned Opc2; + if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) { + Opc2 = N1.getOpcode(); + A = N0; + B = N1.getOperand(0); + C = N1.getOperand(1); + } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) { + Opc2 = N0.getOpcode(); + A = N1; + B = N0.getOperand(0); + C = N0.getOperand(1); + } else + return false; - unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? 
ISD::SUB : ISD::ADD; - SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec); + uint64_t Imm; + switch (Opc1) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: + switch (Opc2) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm = 0x80; break; + case ISD::OR: Imm = 0xe0; break; + case ISD::XOR: Imm = 0x60; break; + case X86ISD::ANDNP: Imm = 0x20; break; + } + break; + case ISD::OR: + switch (Opc2) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm = 0xf8; break; + case ISD::OR: Imm = 0xfe; break; + case ISD::XOR: Imm = 0xf6; break; + case X86ISD::ANDNP: Imm = 0xf2; break; + } + break; + case ISD::XOR: + switch (Opc2) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm = 0x78; break; + case ISD::OR: Imm = 0x1e; break; + case ISD::XOR: Imm = 0x96; break; + case X86ISD::ANDNP: Imm = 0xd2; break; + } + break; + } - ReplaceNode(Node, NewNode.getNode()); - SelectCode(NewNode.getNode()); + SDLoc DL(N); + SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C, + CurDAG->getTargetConstant(Imm, DL, MVT::i8)); + ReplaceNode(N, New.getNode()); + SelectCode(New.getNode()); return true; } @@ -4014,159 +4076,50 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, bool FoldedBCast, bool Masked) { - if (Masked) { - if (FoldedLoad) { - switch (TestVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v16i8: - return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk; - case MVT::v8i16: - return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk; - case MVT::v4i32: - return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk; - case MVT::v2i64: - return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk; - case MVT::v32i8: - return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk; - case MVT::v16i16: - return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk; - case MVT::v8i32: - return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk; - case MVT::v4i64: - return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk; - case MVT::v64i8: - return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk; - case MVT::v32i16: - return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk; - case MVT::v16i32: - return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk; - case MVT::v8i64: - return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk; - } - } - - if (FoldedBCast) { - switch (TestVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v4i32: - return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk; - case MVT::v2i64: - return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk; - case MVT::v8i32: - return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk; - case MVT::v4i64: - return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk; - case MVT::v16i32: - return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk; - case MVT::v8i64: - return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk; - } - } - - switch (TestVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v16i8: - return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk; - case MVT::v8i16: - return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk; - case MVT::v4i32: - return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk; - case MVT::v2i64: - return IsTestN ? 
X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk; - case MVT::v32i8: - return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk; - case MVT::v16i16: - return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk; - case MVT::v8i32: - return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk; - case MVT::v4i64: - return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk; - case MVT::v64i8: - return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk; - case MVT::v32i16: - return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk; - case MVT::v16i32: - return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk; - case MVT::v8i64: - return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk; - } - } +#define VPTESTM_CASE(VT, SUFFIX) \ +case MVT::VT: \ + if (Masked) \ + return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ + return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; + + +#define VPTESTM_BROADCAST_CASES(SUFFIX) \ +default: llvm_unreachable("Unexpected VT!"); \ +VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ +VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ +VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ +VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ +VPTESTM_CASE(v16i32, DZ##SUFFIX) \ +VPTESTM_CASE(v8i64, QZ##SUFFIX) + +#define VPTESTM_FULL_CASES(SUFFIX) \ +VPTESTM_BROADCAST_CASES(SUFFIX) \ +VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ +VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ +VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ +VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ +VPTESTM_CASE(v64i8, BZ##SUFFIX) \ +VPTESTM_CASE(v32i16, WZ##SUFFIX) if (FoldedLoad) { switch (TestVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v16i8: - return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm; - case MVT::v8i16: - return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm; - case MVT::v4i32: - return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm; - case MVT::v2i64: - return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm; - case MVT::v32i8: - return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm; - case MVT::v16i16: - return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm; - case MVT::v8i32: - return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm; - case MVT::v4i64: - return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm; - case MVT::v64i8: - return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm; - case MVT::v32i16: - return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm; - case MVT::v16i32: - return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm; - case MVT::v8i64: - return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm; + VPTESTM_FULL_CASES(rm) } } if (FoldedBCast) { switch (TestVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v4i32: - return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb; - case MVT::v2i64: - return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb; - case MVT::v8i32: - return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb; - case MVT::v4i64: - return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb; - case MVT::v16i32: - return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb; - case MVT::v8i64: - return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb; + VPTESTM_BROADCAST_CASES(rmb) } } switch (TestVT.SimpleTy) { - default: llvm_unreachable("Unexpected VT!"); - case MVT::v16i8: - return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr; - case MVT::v8i16: - return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr; - case MVT::v4i32: - return IsTestN ? 
X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr; - case MVT::v2i64: - return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr; - case MVT::v32i8: - return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr; - case MVT::v16i16: - return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr; - case MVT::v8i32: - return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr; - case MVT::v4i64: - return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr; - case MVT::v64i8: - return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr; - case MVT::v32i16: - return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr; - case MVT::v16i32: - return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr; - case MVT::v8i64: - return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr; + VPTESTM_FULL_CASES(rr) } + +#undef VPTESTM_FULL_CASES +#undef VPTESTM_BROADCAST_CASES +#undef VPTESTM_CASE } // Try to create VPTESTM instruction. If InMask is not null, it will be used @@ -4477,8 +4430,39 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; } + case Intrinsic::x86_tileloadd64: + case Intrinsic::x86_tileloaddt164: + case Intrinsic::x86_tilestored64: { + if (!Subtarget->hasAMXTILE()) + break; + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Unexpected intrinsic!"); + case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; + case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; + case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; + } + // FIXME: Match displacement and scale. + unsigned TIndex = Node->getConstantOperandVal(2); + SDValue TReg = getI8Imm(TIndex, dl); + SDValue Base = Node->getOperand(3); + SDValue Scale = getI8Imm(1, dl); + SDValue Index = Node->getOperand(4); + SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); + SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue Chain = Node->getOperand(0); + MachineSDNode *CNode; + if (Opc == X86::PTILESTORED) { + SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } else { + SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; + CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + } + ReplaceNode(Node, CNode); + return; + } } - break; } case ISD::BRIND: { @@ -4490,9 +4474,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Converts a 32-bit register to a 64-bit, zero-extended version of // it. This is needed because x86-64 can do many things, but jmp %r32 // ain't one of them. - const SDValue &Target = Node->getOperand(1); - assert(Target.getSimpleValueType() == llvm::MVT::i32); - SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Target = Node->getOperand(1); + assert(Target.getValueType() == MVT::i32 && "Unexpected VT!"); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, Node->getOperand(0), ZextTarget); ReplaceNode(Node, Brind.getNode()); @@ -4516,21 +4500,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } break; - case ISD::VSELECT: { - // Replace VSELECT with non-mask conditions with with BLENDV. 
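For reference on the tryVPTERNLOG hunk above: VPTERNLOG's 8-bit immediate is simply the truth table of the three-input boolean function, with the three sources mapped to the constants A=0xF0, B=0xCC and C=0xAA, so every entry in the Imm switch is Opc1(A, Opc2(B, C)) evaluated bitwise (X86ISD::ANDNP(x, y) being ~x & y). A minimal standalone C++ sketch, illustrative only and not part of the patch, that reproduces a few of the table entries:

    #include <cassert>
    #include <cstdint>

    int main() {
      // VPTERNLOG truth-table constants for source operands 0, 1 and 2.
      const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;
      assert((A & (B & C))  == 0x80);   // and(and)
      assert((A & (B ^ C))  == 0x60);   // and(xor)
      assert((A & (~B & C)) == 0x20);   // and(andn)
      assert((A | (~B & C)) == 0xf2);   // or(andn)
      assert((A ^ (B ^ C))  == 0x96);   // xor(xor)
      return 0;
    }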
- if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1) - break; - - assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); - SDValue Blendv = CurDAG->getNode( - X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), - Node->getOperand(1), Node->getOperand(2)); - ReplaceNode(Node, Blendv.getNode()); - SelectCode(Blendv.getNode()); - // We already called ReplaceUses. - return; - } - case ISD::SRL: if (matchBitExtract(Node)) return; @@ -4569,24 +4538,21 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case ISD::XOR: if (tryShrinkShlLogicImm(Node)) return; - if (Opcode == ISD::OR && tryMatchBitSelect(Node)) return; + if (tryVPTERNLOG(Node)) + return; LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: { - if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() && - combineIncDecVector(Node)) - return; - // Try to avoid folding immediates with multiple uses for optsize. // This code tries to select to register form directly to avoid going // through the isel table which might fold the immediate. We can't change // the patterns on the add/sub/and/or/xor with immediate paterns in the // tablegen files to check immediate use count without making the patterns // unavailable to the fast-isel table. - if (!OptForSize) + if (!CurDAG->shouldOptForSize()) break; // Only handle i8/i16/i32/i64. @@ -4720,7 +4686,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); - // Multiply is commmutative. + // Multiply is commutative. if (!FoldedLoad) { FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedLoad) @@ -4772,31 +4738,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); unsigned Opc, MOpc; - bool isSigned = Opcode == ISD::SMUL_LOHI; - if (!isSigned) { - switch (NVT.SimpleTy) { - default: llvm_unreachable("Unsupported VT!"); - case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; - case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; - } - } else { - switch (NVT.SimpleTy) { - default: llvm_unreachable("Unsupported VT!"); - case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; - case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; - } - } - - unsigned SrcReg, LoReg, HiReg; - switch (Opc) { - default: llvm_unreachable("Unknown MUL opcode!"); - case X86::IMUL32r: - case X86::MUL32r: - SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; + unsigned LoReg, HiReg; + bool IsSigned = Opcode == ISD::SMUL_LOHI; + bool UseMULX = !IsSigned && Subtarget->hasBMI2(); + bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); + switch (NVT.SimpleTy) { + default: llvm_unreachable("Unsupported VT!"); + case MVT::i32: + Opc = UseMULXHi ? X86::MULX32Hrr : + UseMULX ? X86::MULX32rr : + IsSigned ? X86::IMUL32r : X86::MUL32r; + MOpc = UseMULXHi ? X86::MULX32Hrm : + UseMULX ? X86::MULX32rm : + IsSigned ? X86::IMUL32m : X86::MUL32m; + LoReg = UseMULX ? X86::EDX : X86::EAX; + HiReg = X86::EDX; break; - case X86::IMUL64r: - case X86::MUL64r: - SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; + case MVT::i64: + Opc = UseMULXHi ? X86::MULX64Hrr : + UseMULX ? X86::MULX64rr : + IsSigned ? X86::IMUL64r : X86::MUL64r; + MOpc = UseMULXHi ? X86::MULX64Hrm : + UseMULX ? X86::MULX64rm : + IsSigned ? X86::IMUL64m : X86::MUL64m; + LoReg = UseMULX ? 
X86::RDX : X86::RAX; + HiReg = X86::RDX; break; } @@ -4809,17 +4775,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) { std::swap(N0, N1); } - SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, + SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); + SDValue ResHi, ResLo; if (foldedLoad) { SDValue Chain; MachineSDNode *CNode = nullptr; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; - SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); - CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); - Chain = SDValue(CNode, 0); - InFlag = SDValue(CNode, 1); + if (UseMULXHi) { + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + Chain = SDValue(CNode, 1); + } else if (UseMULX) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); + CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + Chain = SDValue(CNode, 2); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); + CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + Chain = SDValue(CNode, 0); + InFlag = SDValue(CNode, 1); + } // Update the chain. ReplaceUses(N1.getValue(1), Chain); @@ -4827,27 +4807,42 @@ void X86DAGToDAGISel::Select(SDNode *Node) { CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); } else { SDValue Ops[] = { N1, InFlag }; - SDVTList VTs = CurDAG->getVTList(MVT::Glue); - SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); - InFlag = SDValue(CNode, 0); + if (UseMULXHi) { + SDVTList VTs = CurDAG->getVTList(NVT); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + } else if (UseMULX) { + SDVTList VTs = CurDAG->getVTList(NVT, NVT); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + ResHi = SDValue(CNode, 0); + ResLo = SDValue(CNode, 1); + } else { + SDVTList VTs = CurDAG->getVTList(MVT::Glue); + SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + InFlag = SDValue(CNode, 0); + } } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { - assert(LoReg && "Register for low half is not defined!"); - SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, - NVT, InFlag); - InFlag = ResLo.getValue(2); + if (!ResLo) { + assert(LoReg && "Register for low half is not defined!"); + ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, + NVT, InFlag); + InFlag = ResLo.getValue(2); + } ReplaceUses(SDValue(Node, 0), ResLo); LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. 
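A note on the MULX path selected in the SMUL_LOHI/UMUL_LOHI hunk above: MULX reads one source implicitly from EDX/RDX and writes both product halves to explicit destinations (and, unlike MUL, leaves EFLAGS untouched), so the copies from EAX/EDX in the code that follows are only needed on the legacy MUL/IMUL path, hence the "if (!ResLo)" / "if (!ResHi)" guards. When only the high half is live, the MULX*Hrr/Hrm forms are chosen (UseMULXHi). A small example, not from the patch, of source that exercises that case when built for an x86-64 target with BMI2 (e.g. -mbmi2), using the usual __int128 extension:

    // Only the high 64 bits of the product are used, so the low-half result of
    // the UMUL_LOHI node is dead and the selector can pick MULX64Hrr.
    unsigned long long mulhi64(unsigned long long x, unsigned long long y) {
      return (unsigned long long)(((unsigned __int128)x * y) >> 64);
    }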
if (!SDValue(Node, 1).use_empty()) { - assert(HiReg && "Register for high half is not defined!"); - SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, - NVT, InFlag); - InFlag = ResHi.getValue(2); + if (!ResHi) { + assert(HiReg && "Register for high half is not defined!"); + ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, + NVT, InFlag); + InFlag = ResHi.getValue(2); + } ReplaceUses(SDValue(Node, 1), ResHi); LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); @@ -4862,23 +4857,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); - unsigned Opc, MOpc; + unsigned ROpc, MOpc; bool isSigned = Opcode == ISD::SDIVREM; if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; - case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; - case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; - case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; + case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; + case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; + case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; + case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; } } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; - case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; - case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; - case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; + case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; + case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; + case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; + case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; } } @@ -4943,7 +4938,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. - SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); + SDValue ClrNode = + SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = @@ -4985,7 +4982,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()}); } else { InFlag = - SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); + SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by explicitly copying it to @@ -5034,6 +5031,77 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; } + case X86ISD::FCMP: + case X86ISD::STRICT_FCMP: + case X86ISD::STRICT_FCMPS: { + bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || + Node->getOpcode() == X86ISD::STRICT_FCMPS; + SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0); + SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1); + + // Save the original VT of the compare. + MVT CmpVT = N0.getSimpleValueType(); + + // Floating point needs special handling if we don't have FCOMI. 
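The block that follows handles floating-point compares on targets without CMOV (and hence without FCOMI): the compare is done with (U)COM, the FPU status word is copied to AX with FNSTSW, and AH is loaded into EFLAGS with SAHF so ordinary flag consumers can test the result. An illustrative source snippet, not from the patch, that goes through this path when compiled for such a target (e.g. -m32 -march=i486):

    // Lowered via the FUCOM + FNSTSW AX + SAHF sequence selected below when
    // neither CMOV nor FCOMI is available.
    bool fplt(double a, double b) { return a < b; }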
+ if (Subtarget->hasCMov()) + break; + + bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; + + unsigned Opc; + switch (CmpVT.SimpleTy) { + default: llvm_unreachable("Unexpected type!"); + case MVT::f32: + Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; + break; + case MVT::f64: + Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; + break; + case MVT::f80: + Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; + break; + } + + SDValue Cmp; + SDValue Chain = + IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode(); + if (IsStrictCmp) { + SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other); + Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0); + Chain = Cmp.getValue(1); + } else { + Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0); + } + + // Move FPSW to AX. + SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue()); + Chain = FPSW; + SDValue FNSTSW = + SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW, + FPSW.getValue(1)), + 0); + + // Extract upper 8-bits of AX. + SDValue Extract = + CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW); + + // Move AH into flags. + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && + "Target doesn't support SAHF or FCOMI?"); + SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); + Chain = AH; + SDValue SAHF = SDValue( + CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0); + + if (IsStrictCmp) + ReplaceUses(SDValue(Node, 1), Chain); + + ReplaceUses(SDValue(Node, 0), SAHF); + CurDAG->RemoveDeadNode(Node); + return; + } + case X86ISD::CMP: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); @@ -5267,6 +5335,279 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (foldLoadStoreIntoMemOperand(Node)) return; break; + + case X86ISD::SETCC_CARRY: { + // We have to do this manually because tblgen will put the eflags copy in + // the wrong place if we use an extract_subreg in the pattern. + MVT VT = Node->getSimpleValueType(0); + + // Copy flags to the EFLAGS register and glue it to next node. + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + Node->getOperand(1), SDValue()); + + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; + MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + SDValue Result = SDValue( + CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); + + // For less than 32-bits we need to extract from the 32-bit node. + if (VT == MVT::i8 || VT == MVT::i16) { + int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; + Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); + } + + ReplaceUses(SDValue(Node, 0), Result); + CurDAG->RemoveDeadNode(Node); + return; + } + case X86ISD::SBB: { + if (isNullConstant(Node->getOperand(0)) && + isNullConstant(Node->getOperand(1))) { + MVT VT = Node->getSimpleValueType(0); + + // Create zero. 
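Both the X86ISD::SETCC_CARRY handling above and the SBB-of-two-zeros special case that follows rely on the classic sbb idiom: subtracting a register from itself with borrow yields 0 when CF is clear and all-ones when CF is set, which is what the SETB_C32r/SETB_C64r pseudos (essentially "sbb reg, reg") compute. Written out in C++ for illustration only:

    #include <cstdint>
    // eax = eax - eax - CF = 0 - CF: 0x00000000 if CF is clear,
    // 0xffffffff if CF is set (unsigned wraparound).
    uint32_t setcc_carry(uint32_t eax, bool cf) { return eax - eax - uint32_t(cf); }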
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); + SDValue Zero = + SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); + if (VT == MVT::i64) { + Zero = SDValue( + CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, + CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), + 0); + } + + // Copy flags to the EFLAGS register and glue it to next node. + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + Node->getOperand(2), SDValue()); + + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; + MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + VTs = CurDAG->getVTList(SBBVT, MVT::i32); + SDValue Result = + SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS, + EFLAGS.getValue(1)}), + 0); + + // Replace the flag use. + ReplaceUses(SDValue(Node, 1), Result.getValue(1)); + + // Replace the result use. + if (!SDValue(Node, 0).use_empty()) { + // For less than 32-bits we need to extract from the 32-bit node. + if (VT == MVT::i8 || VT == MVT::i16) { + int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; + Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); + } + ReplaceUses(SDValue(Node, 0), Result); + } + + CurDAG->RemoveDeadNode(Node); + return; + } + break; + } + case X86ISD::MGATHER: { + auto *Mgt = cast<X86MaskedGatherSDNode>(Node); + SDValue IndexOp = Mgt->getIndex(); + SDValue Mask = Mgt->getMask(); + MVT IndexVT = IndexOp.getSimpleValueType(); + MVT ValueVT = Node->getSimpleValueType(0); + MVT MaskVT = Mask.getSimpleValueType(); + + // This is just to prevent crashes if the nodes are malformed somehow. We're + // otherwise only doing loose type checking in here based on type what + // a type constraint would say just like table based isel. + if (!ValueVT.isVector() || !MaskVT.isVector()) + break; + + unsigned NumElts = ValueVT.getVectorNumElements(); + MVT ValueSVT = ValueVT.getVectorElementType(); + + bool IsFP = ValueSVT.isFloatingPoint(); + unsigned EltSize = ValueSVT.getSizeInBits(); + + unsigned Opc = 0; + bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; + if (AVX512Gather) { + if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; + else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; + else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; + else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; + else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; + else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; + else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? 
X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; + else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; + } else { + assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && + "Unexpected mask VT!"); + if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; + else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; + else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; + else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; + else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; + } + + if (!Opc) + break; + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(), + Base, Scale, Index, Disp, Segment)) + break; + + SDValue PassThru = Mgt->getPassThru(); + SDValue Chain = Mgt->getChain(); + // Gather instructions have a mask output not in the ISD node. + SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other); + + MachineSDNode *NewNode; + if (AVX512Gather) { + SDValue Ops[] = {PassThru, Mask, Base, Scale, + Index, Disp, Segment, Chain}; + NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); + } else { + SDValue Ops[] = {PassThru, Base, Scale, Index, + Disp, Segment, Mask, Chain}; + NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); + } + CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()}); + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); + ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2)); + CurDAG->RemoveDeadNode(Node); + return; + } + case X86ISD::MSCATTER: { + auto *Sc = cast<X86MaskedScatterSDNode>(Node); + SDValue Value = Sc->getValue(); + SDValue IndexOp = Sc->getIndex(); + MVT IndexVT = IndexOp.getSimpleValueType(); + MVT ValueVT = Value.getSimpleValueType(); + + // This is just to prevent crashes if the nodes are malformed somehow. We're + // otherwise only doing loose type checking in here based on type what + // a type constraint would say just like table based isel. + if (!ValueVT.isVector()) + break; + + unsigned NumElts = ValueVT.getVectorNumElements(); + MVT ValueSVT = ValueVT.getVectorElementType(); + + bool IsFP = ValueSVT.isFloatingPoint(); + unsigned EltSize = ValueSVT.getSizeInBits(); + + unsigned Opc; + if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; + else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; + else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? 
X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; + else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; + else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; + else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; + else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) + Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; + else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; + else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; + else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) + Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; + else + break; + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(), + Base, Scale, Index, Disp, Segment)) + break; + + SDValue Mask = Sc->getMask(); + SDValue Chain = Sc->getChain(); + // Scatter instructions have a mask output not in the ISD node. + SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); + SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; + + MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); + CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()}); + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1)); + CurDAG->RemoveDeadNode(Node); + return; + } + case ISD::PREALLOCATED_SETUP: { + auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + auto CallId = MFI->getPreallocatedIdForCallSite( + cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); + SDValue Chain = Node->getOperand(0); + SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); + MachineSDNode *New = CurDAG->getMachineNode( + TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); + ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain + CurDAG->RemoveDeadNode(Node); + return; + } + case ISD::PREALLOCATED_ARG: { + auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + auto CallId = MFI->getPreallocatedIdForCallSite( + cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); + SDValue Chain = Node->getOperand(0); + SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); + SDValue ArgIndex = Node->getOperand(2); + SDValue Ops[3]; + Ops[0] = CallIdValue; + Ops[1] = ArgIndex; + Ops[2] = Chain; + MachineSDNode *New = CurDAG->getMachineNode( + TargetOpcode::PREALLOCATED_ARG, dl, + CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), + MVT::Other), + Ops); + ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer + ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain + CurDAG->RemoveDeadNode(Node); + return; + } } SelectCode(Node); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp index c8720d9ae3a6..450927aaf5cc 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12,7 +12,8 @@ 
//===----------------------------------------------------------------------===// #include "X86ISelLowering.h" -#include "Utils/X86ShuffleDecode.h" +#include "MCTargetDesc/X86ShuffleDecode.h" +#include "X86.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" @@ -28,6 +29,7 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -37,7 +39,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -75,13 +76,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); -// Added in 10.0. -static cl::opt<bool> EnableOldKNLABI( - "x86-enable-old-knl-abi", cl::init(false), - cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " - "one ZMM register on AVX512F, but not AVX512BW targets."), - cl::Hidden); - static cl::opt<bool> MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " @@ -164,7 +158,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. - // FIXME: Should we be limitting the atomic size on other configs? Default is + // FIXME: Should we be limiting the atomic size on other configs? Default is // 1024. if (!Subtarget.hasCmpxchg8b()) setMaxAtomicSizeInBitsSupported(32); @@ -190,12 +184,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. - setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); - setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); - setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); - setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); - setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); - setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) { + setCondCodeAction(ISD::SETOEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } // Integer absolute. if (Subtarget.hasCMov()) { @@ -206,10 +198,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { + // For slow shld targets we only lower for code size. + LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? 
Custom : Legal; + + setOperationAction(ShiftOp , MVT::i8 , Custom); setOperationAction(ShiftOp , MVT::i16 , Custom); - setOperationAction(ShiftOp , MVT::i32 , Custom); + setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction); if (Subtarget.is64Bit()) - setOperationAction(ShiftOp , MVT::i64 , Custom); + setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction); } if (!Subtarget.useSoftFloat()) { @@ -270,6 +266,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); + + setOperationAction(ISD::LRINT, MVT::f32, Custom); + setOperationAction(ISD::LRINT, MVT::f64, Custom); + setOperationAction(ISD::LLRINT, MVT::f32, Custom); + setOperationAction(ISD::LLRINT, MVT::f64, Custom); + + if (!Subtarget.is64Bit()) { + setOperationAction(ISD::LRINT, MVT::i64, Custom); + setOperationAction(ISD::LLRINT, MVT::i64, Custom); + } } // Handle address space casts between mixed sized pointers. @@ -347,34 +353,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { - setOperationAction(ISD::CTLZ , MVT::i8 , Custom); - setOperationAction(ISD::CTLZ , MVT::i16 , Custom); - setOperationAction(ISD::CTLZ , MVT::i32 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); - if (Subtarget.is64Bit()) { - setOperationAction(ISD::CTLZ , MVT::i64 , Custom); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { + if (VT == MVT::i64 && !Subtarget.is64Bit()) + continue; + setOperationAction(ISD::CTLZ , VT, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); } } - // Special handling for half-precision floating point conversions. - // If we don't have F16C support, then lower half float conversions - // into library calls. - if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { - setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); + for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16, + ISD::STRICT_FP_TO_FP16}) { + // Special handling for half-precision floating point conversions. + // If we don't have F16C support, then lower half float conversions + // into library calls. + setOperationAction( + Op, MVT::f32, + (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand); + // There's never any support for operations beyond MVT::f32. + setOperationAction(Op, MVT::f64, Expand); + setOperationAction(Op, MVT::f80, Expand); + setOperationAction(Op, MVT::f128, Expand); } - // There's never any support for operations beyond MVT::f32. 
- setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); - setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); - setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); @@ -542,7 +542,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); - } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) { + } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 && + (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); @@ -663,8 +664,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f80, Expand); setOperationAction(ISD::LROUND, MVT::f80, Expand); setOperationAction(ISD::LLROUND, MVT::f80, Expand); - setOperationAction(ISD::LRINT, MVT::f80, Expand); - setOperationAction(ISD::LLRINT, MVT::f80, Expand); + setOperationAction(ISD::LRINT, MVT::f80, Custom); + setOperationAction(ISD::LLRINT, MVT::f80, Custom); // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); @@ -1038,8 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); - // With AVX512, expanding (and promoting the shifts) is better. - if (!Subtarget.hasAVX512()) + // With 512-bit registers or AVX512VL+BW, expanding (and promoting the + // shifts) is better. + if (!Subtarget.useAVX512Regs() && + !(Subtarget.hasBWI() && Subtarget.hasVLX())) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); @@ -1078,6 +1081,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); + + setOperationAction(ISD::FROUND, RoundedTy, Custom); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); @@ -1170,6 +1175,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); + + setOperationAction(ISD::FROUND, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); @@ -1221,7 +1229,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ROTL, MVT::v16i16, Custom); // With BWI, expanding (and promoting the shifts) is the better. 
- if (!Subtarget.hasBWI()) + if (!Subtarget.useBWIRegs()) setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); @@ -1412,19 +1420,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ANY_EXTEND, VT, Custom); } - for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { + for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::UADDSAT, VT, Custom); + setOperationAction(ISD::SADDSAT, VT, Custom); + setOperationAction(ISD::USUBSAT, VT, Custom); + setOperationAction(ISD::SSUBSAT, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + } + + for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); - setOperationAction(ISD::UADDSAT, VT, Custom); - setOperationAction(ISD::SADDSAT, VT, Custom); - setOperationAction(ISD::USUBSAT, VT, Custom); - setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); @@ -1432,7 +1444,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Expand); } for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) @@ -1443,10 +1454,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. 
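With this change the 512-bit block below registers v32i16 and v64i8 whenever 512-bit registers are in use, keeping the AVX512BW-only operations Custom (the HasBWI ? Legal : Custom choices) instead of gating those types behind the separate useBWIRegs() block that is removed further down. A hedged example, not from the patch, of the kind of code affected: a 64-element byte add can now be legalized as a single v64i8 operation in ZMM even on AVX512F-only targets, where the add itself is then custom split.

    // Illustrative only; how this is vectorized still depends on the target
    // attributes (prefer-vector-width / required-vector-width) noted above.
    void add64(unsigned char *a, const unsigned char *b) {
      for (int i = 0; i != 64; ++i)
        a[i] = (unsigned char)(a[i] + b[i]);
    }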
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { + bool HasBWI = Subtarget.hasBWI(); + addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); + addRegisterClass(MVT::v32i16, &X86::VR512RegClass); + addRegisterClass(MVT::v64i8, &X86::VR512RegClass); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); @@ -1454,6 +1469,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); + if (HasBWI) + setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { @@ -1497,6 +1514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (HasBWI) + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use @@ -1509,19 +1528,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - // Need to custom widen this if we don't have AVX512BW. - setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); + if (HasBWI) { + // Extends from v64i1 masks to 512-bit vectors. 
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); + } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1535,47 +1561,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); - setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::FROUND, VT, Custom); } - // Without BWI we need to use custom lowering to handle MVT::v64i8 input. - for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { + for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); + setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); + setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); + + setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MUL, MVT::v64i8, Custom); + + setOperationAction(ISD::MULHU, MVT::v16i32, Custom); + setOperationAction(ISD::MULHS, MVT::v16i32, Custom); + setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); + setOperationAction(ISD::MULHS, MVT::v64i8, Custom); + setOperationAction(ISD::MULHU, MVT::v64i8, Custom); - setOperationAction(ISD::MUL, MVT::v8i64, Custom); - setOperationAction(ISD::MUL, MVT::v16i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v16i32, Custom); - setOperationAction(ISD::MULHS, MVT::v16i32, Custom); + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + // The condition codes aren't legal in SSE/AVX and under AVX512 we use + // setcc all the way to isel and prefer SETGT in some isel patterns. 
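As the comment above notes, SETLT/SETLE are marked Custom below so that integer vector less-than compares are rewritten to use the greater-than patterns with swapped operands, since SSE/AVX have no dedicated packed less-than instruction. A small illustration at the intrinsic level, not from the patch:

    #include <emmintrin.h>
    // _mm_cmplt_epi32(a, b) has no dedicated instruction; it is emitted as
    // pcmpgtd with the operands swapped, i.e. (a < b) == (b > a).
    __m128i lt32(__m128i a, __m128i b) { return _mm_cmplt_epi32(a, b); }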
+ setCondCodeAction(ISD::SETLT, VT, Custom); + setCondCodeAction(ISD::SETLE, VT, Custom); + } for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); + } - // The condition codes aren't legal in SSE/AVX and under AVX512 we use - // setcc all the way to isel and prefer SETGT in some isel patterns. - setCondCodeAction(ISD::SETLT, VT, Custom); - setCondCodeAction(ISD::SETLE, VT, Custom); + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); + setOperationAction(ISD::CTLZ, VT, Custom); + setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); + setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom); } if (Subtarget.hasDQI()) { @@ -1610,36 +1658,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, + MVT::v16f32, MVT::v8f64 }) { + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + } + for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } - if (!Subtarget.hasBWI()) { - // Need to custom split v32i16/v64i8 bitcasts. - setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); - setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); - - // Better to split these into two 256-bit ops. 
- setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom); + if (HasBWI) { + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::MLOAD, VT, Legal); + setOperationAction(ISD::MSTORE, VT, Legal); + } + } else { + setOperationAction(ISD::STORE, MVT::v32i16, Custom); + setOperationAction(ISD::STORE, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { + for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } - }// has AVX-512 + }// useAVX512Regs // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for @@ -1667,6 +1721,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Subtarget.hasVLX() ? Legal : Custom); + if (Subtarget.hasDQI()) { + // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. + // v2f32 UINT_TO_FP is already custom under SSE2. + assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && + isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && + "Unexpected operation action!"); + // v2i64 FP_TO_S/UINT(v2f32) custom conversion. + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); + } + for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -1746,12 +1813,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); @@ -1759,93 +1824,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); - } - - // This block controls legalization for v32i16 and v64i8. 512-bits can be - // disabled based on prefer-vector-width and required-vector-width function - // attributes. - if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { - addRegisterClass(MVT::v32i16, &X86::VR512RegClass); - addRegisterClass(MVT::v64i8, &X86::VR512RegClass); - - // Extends from v64i1 masks to 512-bit vectors. 
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); - - setOperationAction(ISD::MUL, MVT::v32i16, Legal); - setOperationAction(ISD::MUL, MVT::v64i8, Custom); - setOperationAction(ISD::MULHS, MVT::v32i16, Legal); - setOperationAction(ISD::MULHU, MVT::v32i16, Legal); - setOperationAction(ISD::MULHS, MVT::v64i8, Custom); - setOperationAction(ISD::MULHU, MVT::v64i8, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); - setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); - setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); - setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); - - setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); - - for (auto VT : { MVT::v64i8, MVT::v32i16 }) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Custom); - setOperationAction(ISD::ABS, VT, Legal); - setOperationAction(ISD::SRL, VT, Custom); - setOperationAction(ISD::SHL, VT, Custom); - setOperationAction(ISD::SRA, VT, Custom); - setOperationAction(ISD::MLOAD, VT, Legal); - setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::CTLZ, VT, Custom); - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::UADDSAT, VT, Legal); - setOperationAction(ISD::SADDSAT, VT, Legal); - setOperationAction(ISD::USUBSAT, VT, Legal); - setOperationAction(ISD::SSUBSAT, VT, Legal); - setOperationAction(ISD::SELECT, VT, Custom); - - // The condition codes aren't legal in SSE/AVX and under AVX512 we use - // setcc all the way to isel and prefer SETGT in some isel patterns. 
- setCondCodeAction(ISD::SETLT, VT, Custom); - setCondCodeAction(ISD::SETLE, VT, Custom); - } - - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { - setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); - } - - if (Subtarget.hasBITALG()) { - for (auto VT : { MVT::v64i8, MVT::v32i16 }) - setOperationAction(ISD::CTPOP, VT, Legal); - } - - if (Subtarget.hasVBMI2()) { - setOperationAction(ISD::FSHL, MVT::v32i16, Custom); - setOperationAction(ISD::FSHR, MVT::v32i16, Custom); - } - } - if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); @@ -1874,19 +1853,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); - if (Subtarget.hasDQI()) { - // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. - // v2f32 UINT_TO_FP is already custom under SSE2. - assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && - isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && - "Unexpected operation action!"); - // v2i64 FP_TO_S/UINT(v2f32) custom conversion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); - } - if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); @@ -1983,6 +1949,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); @@ -2000,6 +1967,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::STRICT_FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); @@ -2024,6 +1992,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); + setTargetDAGCombine(ISD::FP16_TO_FP); + setTargetDAGCombine(ISD::FP_EXTEND); + setTargetDAGCombine(ISD::STRICT_FP_EXTEND); + setTargetDAGCombine(ISD::FP_ROUND); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -2075,7 +2047,8 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) + if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() && + !Subtarget.hasBWI()) return TypeSplitVector; if (VT.getVectorNumElements() != 1 && @@ -2085,51 +2058,73 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { return TargetLoweringBase::getPreferredVectorAction(VT); } +static std::pair<MVT, unsigned> +handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, + const X86Subtarget &Subtarget) { + 
// v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling + // convention is one that uses k registers. + if (NumElts == 2) + return {MVT::v2i64, 1}; + if (NumElts == 4) + return {MVT::v4i32, 1}; + if (NumElts == 8 && CC != CallingConv::X86_RegCall && + CC != CallingConv::Intel_OCL_BI) + return {MVT::v8i16, 1}; + if (NumElts == 16 && CC != CallingConv::X86_RegCall && + CC != CallingConv::Intel_OCL_BI) + return {MVT::v16i8, 1}; + // v32i1 passes in ymm unless we have BWI and the calling convention is + // regcall. + if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall)) + return {MVT::v32i8, 1}; + // Split v64i1 vectors if we don't have v64i8 available. + if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) { + if (Subtarget.useAVX512Regs()) + return {MVT::v64i8, 1}; + return {MVT::v32i8, 2}; + } + + // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. + if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) || + NumElts > 64) + return {MVT::i8, NumElts}; + + return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0}; +} + MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // v32i1 vectors should be promoted to v32i8 to match avx2. - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) - return MVT::v32i8; - // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512() && - (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) - return MVT::i8; - // Split v64i1 vectors if we don't have v64i8 available. - if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - CC != CallingConv::X86_RegCall) - return MVT::v32i1; - // FIXME: Should we just make these types legal and custom split operations? - if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && - Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) - return MVT::v16i32; + Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return RegisterVT; + } + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // v32i1 vectors should be promoted to v32i8 to match avx2. - if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) - return 1; - // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && - Subtarget.hasAVX512() && - (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) - return VT.getVectorNumElements(); - // Split v64i1 vectors if we don't have v64i8 available. - if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && - CC != CallingConv::X86_RegCall) - return 2; - // FIXME: Should we just make these types legal and custom split operations? 
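The new handleMaskRegisterForCallingConv helper centralizes the vXi1 mapping that getRegisterTypeForCallingConv and getNumRegistersForCallingConv previously duplicated. As a rough guide to the table it encodes, here is a minimal standalone sketch (not LLVM code: type names are plain strings, and the X86_RegCall / Intel_OCL_BI checks are folded into a single IsKRegCC flag, which slightly over-approximates the v32i1 case):

#include <cstdio>
#include <utility>

// Returns {register type, number of registers} for a vNi1 mask argument,
// mirroring the order of checks in handleMaskRegisterForCallingConv.
static std::pair<const char *, unsigned>
maskRegForCC(unsigned NumElts, bool IsKRegCC, bool HasBWI, bool UseAVX512Regs) {
  if (NumElts == 2)
    return {"v2i64", 1};
  if (NumElts == 4)
    return {"v4i32", 1};
  if (NumElts == 8 && !IsKRegCC)
    return {"v8i16", 1};
  if (NumElts == 16 && !IsKRegCC)
    return {"v16i8", 1};
  if (NumElts == 32 && (!HasBWI || !IsKRegCC))
    return {"v32i8", 1};
  if (NumElts == 64 && HasBWI && !IsKRegCC)
    return UseAVX512Regs ? std::make_pair("v64i8", 1u)
                         : std::make_pair("v32i8", 2u);
  // Wide or non-power-of-2 masks are scalarized to one i8 per element.
  bool IsPow2 = NumElts && !(NumElts & (NumElts - 1));
  if (!IsPow2 || (NumElts == 64 && !HasBWI) || NumElts > 64)
    return {"i8", NumElts};
  // Anything else falls through to the generic lowering (k registers).
  return {"", 0};
}

int main() {
  // v64i1 on an AVX512BW target that prefers 256-bit registers: 2 x v32i8.
  auto R = maskRegForCC(64, false, /*HasBWI=*/true, /*UseAVX512Regs=*/false);
  std::printf("v64i1 -> %u x %s\n", R.second, R.first);
  return 0;
}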
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI && - Subtarget.useAVX512Regs() && !Subtarget.hasBWI()) - return 1; + Subtarget.hasAVX512()) { + unsigned NumElts = VT.getVectorNumElements(); + + MVT RegisterVT; + unsigned NumRegisters; + std::tie(RegisterVT, NumRegisters) = + handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); + if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) + return NumRegisters; + } + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -2140,8 +2135,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512() && (!isPowerOf2_32(VT.getVectorNumElements()) || - (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) || - (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) { + (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || + VT.getVectorNumElements() > 64)) { RegisterVT = MVT::i8; IntermediateVT = MVT::i1; NumIntermediates = VT.getVectorNumElements(); @@ -2151,7 +2146,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( // Split v64i1 vectors if we don't have v64i8 available. if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && CC != CallingConv::X86_RegCall) { - RegisterVT = MVT::v32i1; + RegisterVT = MVT::v32i8; IntermediateVT = MVT::v32i1; NumIntermediates = 2; return 2; @@ -2194,20 +2189,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. -static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { +static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { - if (VTy->getBitWidth() == 128) - MaxAlign = 16; + if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128) + MaxAlign = Align(16); } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { - unsigned EltAlign = 0; + Align EltAlign; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { for (auto *EltTy : STy->elements()) { - unsigned EltAlign = 0; + Align EltAlign; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; @@ -2225,46 +2220,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. - unsigned TyAlign = DL.getABITypeAlignment(Ty); + Align TyAlign = DL.getABITypeAlign(Ty); if (TyAlign > 8) - return TyAlign; + return TyAlign.value(); return 8; } - unsigned Align = 4; + Align Alignment(4); if (Subtarget.hasSSE1()) - getMaxByValAlign(Ty, Align); - return Align; -} - -/// Returns the target specific optimal type for load -/// and store operations as a result of memset, memcpy, and memmove -/// lowering. If DstAlign is zero that means it's safe to destination -/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it -/// means there isn't a need to check it against alignment requirement, -/// probably because the source does not need to be loaded. If 'IsMemset' is -/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that -/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy -/// source is constant so it does not need to be loaded. 
+ getMaxByValAlign(Ty, Alignment); + return Alignment.value(); +} + /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. /// For vector ops we check that the overall size isn't larger than our /// preferred vector width. EVT X86TargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { - if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || - ((DstAlign == 0 || DstAlign >= 16) && - (SrcAlign == 0 || SrcAlign >= 16)))) { + if (Op.size() >= 16 && + (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { // FIXME: Check if unaligned 64-byte accesses are slow. - if (Size >= 64 && Subtarget.hasAVX512() && + if (Op.size() >= 64 && Subtarget.hasAVX512() && (Subtarget.getPreferVectorWidth() >= 512)) { return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; } // FIXME: Check if unaligned 32-byte accesses are slow. - if (Size >= 32 && Subtarget.hasAVX() && + if (Op.size() >= 32 && Subtarget.hasAVX() && (Subtarget.getPreferVectorWidth() >= 256)) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we @@ -2280,8 +2263,8 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && (Subtarget.getPreferVectorWidth() >= 128)) return MVT::v4f32; - } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && - !Subtarget.is64Bit() && Subtarget.hasSSE2()) { + } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) && + Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. @@ -2294,7 +2277,7 @@ EVT X86TargetLowering::getOptimalMemOpType( // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. - if (Subtarget.is64Bit() && Size >= 8) + if (Subtarget.is64Bit() && Op.size() >= 8) return MVT::i64; return MVT::i32; } @@ -2611,7 +2594,7 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, - SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA, + SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); @@ -2656,14 +2639,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); - SDValue Flag; - SmallVector<SDValue, 6> RetOps; - RetOps.push_back(Chain); // Operand #0 = Chain (updated below) - // Operand #1 = Bytes To Pop - RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, - MVT::i32)); - - // Copy the result values into the output registers. 
+ SmallVector<std::pair<Register, SDValue>, 4> RetVals; for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; @@ -2715,7 +2691,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); - RetOps.push_back(ValToCopy); + RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); // Don't emit a copytoreg. continue; } @@ -2736,31 +2712,39 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, } } - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; - if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); - Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I], + Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I], Subtarget); - assert(2 == RegsToPass.size() && - "Expecting two registers after Pass64BitArgInRegs"); - // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { - RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); + RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } + } + + SDValue Flag; + SmallVector<SDValue, 6> RetOps; + RetOps.push_back(Chain); // Operand #0 = Chain (updated below) + // Operand #1 = Bytes To Pop + RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, + MVT::i32)); - // Add nodes to the DAG and add the values into the RetOps list - for (auto &Reg : RegsToPass) { - Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); + // Copy the result values into the output registers. + for (auto &RetVal : RetVals) { + if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) { + RetOps.push_back(RetVal.second); + continue; // Don't emit a copytoreg. } + + Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag); + Flag = Chain.getValue(1); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Swift calling convention does not require we copy the sret argument @@ -2775,7 +2759,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. - if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { + if (Register SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. @@ -2798,7 +2782,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); - unsigned RetValReg + Register RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? 
X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); @@ -2924,7 +2908,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); @@ -3133,10 +3117,10 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); - return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), - /*isVolatile*/false, /*AlwaysInline=*/true, - /*isTailCall*/false, - MachinePointerInfo(), MachinePointerInfo()); + return DAG.getMemcpy( + Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), + /*isVolatile*/ false, /*AlwaysInline=*/true, + /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. @@ -3176,8 +3160,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!CI->isTailCall()) return false; - ImmutableCallSite CS(CI); - CallingConv::ID CalleeCC = CS.getCallingConv(); + CallingConv::ID CalleeCC = CI->getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; @@ -3341,20 +3324,223 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, #ifndef NDEBUG static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { - return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), - [](const CCValAssign &A, const CCValAssign &B) -> bool { - return A.getValNo() < B.getValNo(); - }); + return llvm::is_sorted( + ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { + return A.getValNo() < B.getValNo(); + }); } #endif +namespace { +/// This is a helper class for lowering variable arguments parameters. +class VarArgsLoweringHelper { +public: + VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + CallingConv::ID CallConv, CCState &CCInfo) + : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), + TheMachineFunction(DAG.getMachineFunction()), + TheFunction(TheMachineFunction.getFunction()), + FrameInfo(TheMachineFunction.getFrameInfo()), + FrameLowering(*Subtarget.getFrameLowering()), + TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), + CCInfo(CCInfo) {} + + // Lower variable arguments parameters. 
+ void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); + +private: + void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); + + void forwardMustTailParameters(SDValue &Chain); + + bool is64Bit() { return Subtarget.is64Bit(); } + bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); } + + X86MachineFunctionInfo *FuncInfo; + const SDLoc &DL; + SelectionDAG &DAG; + const X86Subtarget &Subtarget; + MachineFunction &TheMachineFunction; + const Function &TheFunction; + MachineFrameInfo &FrameInfo; + const TargetFrameLowering &FrameLowering; + const TargetLowering &TargLowering; + CallingConv::ID CallConv; + CCState &CCInfo; +}; +} // namespace + +void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( + SDValue &Chain, unsigned StackSize) { + // If the function takes variable number of arguments, make a frame index for + // the start of the first vararg value... for expansion of llvm.va_start. We + // can skip this if there are no va_start calls. + if (is64Bit() || (CallConv != CallingConv::X86_FastCall && + CallConv != CallingConv::X86_ThisCall)) { + FuncInfo->setVarArgsFrameIndex( + FrameInfo.CreateFixedObject(1, StackSize, true)); + } + + // Figure out if XMM registers are in use. + assert(!(Subtarget.useSoftFloat() && + TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) && + "SSE register cannot be used when SSE is disabled!"); + + // 64-bit calling conventions support varargs and register parameters, so we + // have to do extra work to spill them in the prologue. + if (is64Bit()) { + // Find the first unallocated argument registers. + ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); + ArrayRef<MCPhysReg> ArgXMMs = + get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget); + unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); + unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); + + assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && + "SSE register cannot be used when SSE is disabled!"); + + if (isWin64()) { + // Get to the caller-allocated home save location. Add 8 to account + // for the return address. + int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; + FuncInfo->setRegSaveFrameIndex( + FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); + // Fixup to set vararg frame on shadow area (4 x i64). + if (NumIntRegs < 4) + FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); + } else { + // For X86-64, if there are vararg parameters that are passed via + // registers, then we must store them to their spots on the stack so + // they may be loaded by dereferencing the result of va_next. + FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); + FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); + FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( + ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false)); + } + + SmallVector<SDValue, 6> + LiveGPRs; // list of SDValue for GPR registers keeping live input value + SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers + // keeping live input value + SDValue ALVal; // if applicable keeps SDValue for %al register + + // Gather all the live in physical registers. 
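The offsets set up in createVarArgAreaAndStoreRegisters follow the SysV x86-64 va_list layout and are easy to check by hand. A small standalone sketch of that arithmetic, assuming the usual SysV argument-register sets of 6 GPRs and 8 XMM registers (an ABI assumption, not text taken from this hunk):

#include <cassert>
#include <cstdio>

// Mirrors the non-Win64 branch: the register save area holds the unnamed GPR
// arguments first (8 bytes each) and the XMM arguments after them (16 bytes
// each).
struct RegSaveArea {
  unsigned GPOffset;  // FuncInfo->getVarArgsGPOffset()
  unsigned FPOffset;  // FuncInfo->getVarArgsFPOffset()
  unsigned TotalSize; // size of the Align(16) stack object
};

static RegSaveArea computeSysVRegSaveArea(unsigned NumIntRegsUsed,
                                          unsigned NumXMMRegsUsed) {
  const unsigned NumArgGPRs = 6; // rdi, rsi, rdx, rcx, r8, r9
  const unsigned NumArgXMMs = 8; // xmm0..xmm7
  assert(NumIntRegsUsed <= NumArgGPRs && NumXMMRegsUsed <= NumArgXMMs);
  RegSaveArea A;
  A.GPOffset = NumIntRegsUsed * 8;
  A.FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16;
  A.TotalSize = NumArgGPRs * 8 + NumArgXMMs * 16; // 48 + 128 = 176 bytes
  return A;
}

int main() {
  // A variadic function such as printf(const char *fmt, ...) consumes one GPR
  // (the format string) and no XMM registers for its named arguments.
  RegSaveArea A = computeSysVRegSaveArea(/*NumIntRegsUsed=*/1,
                                         /*NumXMMRegsUsed=*/0);
  std::printf("gp_offset=%u fp_offset=%u size=%u\n", A.GPOffset, A.FPOffset,
              A.TotalSize); // gp_offset=8 fp_offset=48 size=176
  return 0;
}

With one named integer argument, va_arg starts reading spilled GPRs at offset 8 and spilled XMM registers at offset 48 of the 176-byte area, which is what the two setVarArgs*Offset calls above record.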
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { + Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass); + LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64)); + } + const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs); + if (!AvailableXmms.empty()) { + Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); + ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8); + for (MCPhysReg Reg : AvailableXmms) { + Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass); + LiveXMMRegs.push_back( + DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32)); + } + } + + // Store the integer parameter registers. + SmallVector<SDValue, 8> MemOps; + SDValue RSFIN = + DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), + TargLowering.getPointerTy(DAG.getDataLayout())); + unsigned Offset = FuncInfo->getVarArgsGPOffset(); + for (SDValue Val : LiveGPRs) { + SDValue FIN = DAG.getNode(ISD::ADD, DL, + TargLowering.getPointerTy(DAG.getDataLayout()), + RSFIN, DAG.getIntPtrConstant(Offset, DL)); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset)); + MemOps.push_back(Store); + Offset += 8; + } + + // Now store the XMM (fp + vector) parameter registers. + if (!LiveXMMRegs.empty()) { + SmallVector<SDValue, 12> SaveXMMOps; + SaveXMMOps.push_back(Chain); + SaveXMMOps.push_back(ALVal); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL)); + SaveXMMOps.push_back( + DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL)); + SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), + LiveXMMRegs.end()); + MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL, + MVT::Other, SaveXMMOps)); + } + + if (!MemOps.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } +} + +void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { + // Find the largest legal vector type. + MVT VecVT = MVT::Other; + // FIXME: Only some x86_32 calling conventions support AVX512. + if (Subtarget.useAVX512Regs() && + (is64Bit() || (CallConv == CallingConv::X86_VectorCall || + CallConv == CallingConv::Intel_OCL_BI))) + VecVT = MVT::v16f32; + else if (Subtarget.hasAVX()) + VecVT = MVT::v8f32; + else if (Subtarget.hasSSE2()) + VecVT = MVT::v4f32; + + // We forward some GPRs and some vector types. + SmallVector<MVT, 2> RegParmTypes; + MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; + RegParmTypes.push_back(IntVT); + if (VecVT != MVT::Other) + RegParmTypes.push_back(VecVT); + + // Compute the set of forwarded registers. The rest are scratch. + SmallVectorImpl<ForwardedRegister> &Forwards = + FuncInfo->getForwardedMustTailRegParms(); + CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); + + // Forward AL for SysV x86_64 targets, since it is used for varargs. + if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) { + Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); + Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); + } + + // Copy all forwards from physical to virtual registers. + for (ForwardedRegister &FR : Forwards) { + // FIXME: Can we use a less constrained schedule? 
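forwardMustTailParameters picks the widest vector type it may legally forward, since the eventual musttail call site can pass anything in those registers. A compact sketch of just that selection, with the calling-convention condition reduced to a single CCSupportsZMM flag (illustrative only, not LLVM code):

#include <cstdio>

// Widest vector register class that unnamed musttail arguments may occupy.
static const char *forwardedVectorType(bool UseAVX512Regs, bool CCSupportsZMM,
                                       bool HasAVX, bool HasSSE2) {
  if (UseAVX512Regs && CCSupportsZMM)
    return "v16f32"; // full ZMM
  if (HasAVX)
    return "v8f32";  // YMM
  if (HasSSE2)
    return "v4f32";  // XMM
  return "none";     // MVT::Other: no vector registers are forwarded
}

int main() {
  // A 32-bit target with AVX2 but a plain C calling convention: forward YMM.
  std::printf("%s\n", forwardedVectorType(false, false, true, true)); // v8f32
  return 0;
}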
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT); + FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( + TargLowering.getRegClassFor(FR.VT)); + Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal); + } +} + +void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, + unsigned StackSize) { + // Set FrameIndex to the 0xAAAAAAA value to mark unset state. + // If necessary, it would be set into the correct value later. + FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); + FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); + + if (FrameInfo.hasVAStart()) + createVarArgAreaAndStoreRegisters(Chain, StackSize); + + if (FrameInfo.hasMustTailInVarArgFunc()) + forwardMustTailParameters(Chain); +} + SDValue X86TargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); - const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && @@ -3366,16 +3552,16 @@ SDValue X86TargetLowering::LowerFormalArguments( bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( - !(isVarArg && canGuaranteeTCO(CallConv)) && + !(IsVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeArguments(Ins, CC_X86); @@ -3446,7 +3632,7 @@ SDValue X86TargetLowering::LowerFormalArguments( else llvm_unreachable("Unknown argument type!"); - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + Register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } @@ -3500,7 +3686,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { - unsigned Reg = FuncInfo->getSRetReturnReg(); + Register Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); @@ -3518,147 +3704,12 @@ SDValue X86TargetLowering::LowerFormalArguments( MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); - // If the function takes variable number of arguments, make a frame index for - // the start of the first vararg value... for expansion of llvm.va_start. We - // can skip this if there are no va_start calls. - if (MFI.hasVAStart() && - (Is64Bit || (CallConv != CallingConv::X86_FastCall && - CallConv != CallingConv::X86_ThisCall))) { - FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); - } - - // Figure out if XMM registers are in use. 
- assert(!(Subtarget.useSoftFloat() && - F.hasFnAttribute(Attribute::NoImplicitFloat)) && - "SSE register cannot be used when SSE is disabled!"); - - // 64-bit calling conventions support varargs and register parameters, so we - // have to do extra work to spill them in the prologue. - if (Is64Bit && isVarArg && MFI.hasVAStart()) { - // Find the first unallocated argument registers. - ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); - ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); - unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); - unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); - assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && - "SSE register cannot be used when SSE is disabled!"); - - // Gather all the live in physical registers. - SmallVector<SDValue, 6> LiveGPRs; - SmallVector<SDValue, 8> LiveXMMRegs; - SDValue ALVal; - for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { - unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); - LiveGPRs.push_back( - DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); - } - if (!ArgXMMs.empty()) { - unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); - for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { - unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); - LiveXMMRegs.push_back( - DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); - } - } - - if (IsWin64) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex( - MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); - // Fixup to set vararg frame on shadow area (4 x i64). - if (NumIntRegs < 4) - FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); - } else { - // For X86-64, if there are vararg parameters that are passed via - // registers, then we must store them to their spots on the stack so - // they may be loaded by dereferencing the result of va_next. - FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); - FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); - FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( - ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); - } - - // Store the integer parameter registers. - SmallVector<SDValue, 8> MemOps; - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned Offset = FuncInfo->getVarArgsGPOffset(); - for (SDValue Val : LiveGPRs) { - SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), - RSFIN, DAG.getIntPtrConstant(Offset, dl)); - SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - DAG.getMachineFunction(), - FuncInfo->getRegSaveFrameIndex(), Offset)); - MemOps.push_back(Store); - Offset += 8; - } - - if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { - // Now store the XMM (fp + vector) parameter registers. 
- SmallVector<SDValue, 12> SaveXMMOps; - SaveXMMOps.push_back(Chain); - SaveXMMOps.push_back(ALVal); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getRegSaveFrameIndex(), dl)); - SaveXMMOps.push_back(DAG.getIntPtrConstant( - FuncInfo->getVarArgsFPOffset(), dl)); - SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), - LiveXMMRegs.end()); - MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, - MVT::Other, SaveXMMOps)); - } - - if (!MemOps.empty()) - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } - - if (isVarArg && MFI.hasMustTailInVarArgFunc()) { - // Find the largest legal vector type. - MVT VecVT = MVT::Other; - // FIXME: Only some x86_32 calling conventions support AVX512. - if (Subtarget.useAVX512Regs() && - (Is64Bit || (CallConv == CallingConv::X86_VectorCall || - CallConv == CallingConv::Intel_OCL_BI))) - VecVT = MVT::v16f32; - else if (Subtarget.hasAVX()) - VecVT = MVT::v8f32; - else if (Subtarget.hasSSE2()) - VecVT = MVT::v4f32; - - // We forward some GPRs and some vector types. - SmallVector<MVT, 2> RegParmTypes; - MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; - RegParmTypes.push_back(IntVT); - if (VecVT != MVT::Other) - RegParmTypes.push_back(VecVT); - - // Compute the set of forwarded registers. The rest are scratch. - SmallVectorImpl<ForwardedRegister> &Forwards = - FuncInfo->getForwardedMustTailRegParms(); - CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); - - // Forward AL for SysV x86_64 targets, since it is used for varargs. - if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) { - unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); - Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); - } - - // Copy all forwards from physical to virtual registers. - for (ForwardedRegister &FR : Forwards) { - // FIXME: Can we use a less constrained schedule? - SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT); - FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT)); - Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal); - } - } + if (IsVarArg) + VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) + .lowerVarArgsParameters(Chain, StackSize); // Some CCs need callee pop. - if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, + if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { @@ -3677,10 +3728,6 @@ SDValue X86TargetLowering::LowerFormalArguments( if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); - if (CallConv == CallingConv::X86_FastCall || - CallConv == CallingConv::X86_ThisCall) - // fastcc functions can't have varargs. - FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); @@ -3697,7 +3744,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). 
- int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); + int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -3705,7 +3752,7 @@ SDValue X86TargetLowering::LowerFormalArguments( if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); - for (std::pair<unsigned, unsigned> Pair : MRI.liveins()) + for (std::pair<Register, Register> Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } @@ -3716,12 +3763,13 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const { + ISD::ArgFlagsTy Flags, + bool isByVal) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); - if (Flags.isByVal()) + if (isByVal) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( @@ -3796,18 +3844,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || CallConv == CallingConv::Tail; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); - const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); + const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); - const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction()); + const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB); bool HasNoCfCheck = (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()); const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); MachineFunction::CallSiteInfo CSInfo; - if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); @@ -3823,7 +3870,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isTailCall = false; } - bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); + bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address @@ -3854,7 +3901,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Allocate shadow area for Win64. 
if (IsWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeArguments(Outs, CC_X86); @@ -3900,6 +3947,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); + } else if (CLI.IsPreallocated) { + assert(ArgLocs.back().isMemLoc() && + "cannot use preallocated attribute on a register " + "parameter"); + SmallVector<size_t, 4> PreallocatedOffsets; + for (size_t i = 0; i < CLI.OutVals.size(); ++i) { + if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { + PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); + } + } + auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); + size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB); + MFI->setPreallocatedStackSize(PreallocatedId, NumBytes); + MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); + NumBytesToPush = 0; } if (!IsSibcall && !IsMustTail) @@ -3912,7 +3974,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; SDValue StackPtr; @@ -3927,9 +3989,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); - // Skip inalloca arguments, they have already been written. + // Skip inalloca/preallocated arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; - if (Flags.isInAlloca()) + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; CCValAssign &VA = ArgLocs[I]; @@ -3968,8 +4030,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // the caller from seeing any modifications the callee may make // as guaranteed by the `byval` attribute. int FrameIdx = MF.getFrameInfo().CreateStackObject( - Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()), - false); + Flags.getByValSize(), + std::max(Align(16), Flags.getNonZeroByValAlign()), false); SDValue StackSlot = DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); Chain = @@ -3998,12 +4060,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); const TargetOptions &Options = DAG.getTarget().Options; - if (Options.EnableDebugEntryValues) + if (Options.EmitCallSiteInfo) CSInfo.emplace_back(VA.getLocReg(), I); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. - unsigned ShadowReg = 0; + Register ShadowReg; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; @@ -4019,7 +4081,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, - dl, DAG, VA, Flags)); + dl, DAG, VA, Flags, isByVal)); } } @@ -4031,7 +4093,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // GOT pointer. 
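The hunk above also carries the Win64 varargs rule that a floating-point argument passed in an XMM register is copied into the integer register of the same argument slot. Stated as a tiny standalone lookup (the XMM2/XMM3 rows are the standard Win64 ABI pairing, not lines visible in this hunk):

#include <cstdio>

// Win64: each of the first four argument slots owns both an XMM and a GPR; a
// varargs callee may read either, so FP values in XMMn are mirrored to the GPR.
static const char *win64ShadowGPR(unsigned XmmIndex) {
  switch (XmmIndex) {
  case 0: return "rcx";
  case 1: return "rdx";
  case 2: return "r8";
  case 3: return "r9";
  default: return nullptr; // slots 4+ go on the stack, no shadow register
  }
}

int main() {
  // printf("%f", 1.0) on Win64: 1.0 travels in xmm1 and is mirrored into rdx.
  std::printf("xmm1 shadows into %s\n", win64ShadowGPR(1));
  return 0;
}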
if (!isTailCall) { RegsToPass.push_back(std::make_pair( - unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), + Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the @@ -4069,8 +4131,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); - - RegsToPass.push_back(std::make_pair(unsigned(X86::AL), + RegsToPass.push_back(std::make_pair(Register(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } @@ -4079,7 +4140,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.push_back(std::make_pair(F.PReg, Val)); } } @@ -4117,8 +4178,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; - // Skip inalloca arguments. They don't require any work. - if (Flags.isInAlloca()) + // Skip inalloca/preallocated arguments. They don't require any work. + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; @@ -4219,7 +4280,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. - if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { + if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() @@ -4278,11 +4339,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); + DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); // Save heapallocsite metadata. - if (CLI.CS) - if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite")) + if (CLI.CB) + if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite")) DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); // Create the CALLSEQ_END node. @@ -4301,12 +4363,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else NumBytesForCalleeToPop = 0; // Callee pops nothing. - if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { - // No need to reset the stack after the call if the call doesn't return. To - // make the MI verify, we'll pretend the callee does it for us. - NumBytesForCalleeToPop = NumBytes; - } - // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, @@ -4337,7 +4393,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples -// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) +// of stack alignment. 
(Dynamic linkers need this - Darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the @@ -4359,7 +4415,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, SelectionDAG &DAG) const { - const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); + const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"); @@ -4395,7 +4451,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { - unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); + Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); if (!Register::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); @@ -4578,7 +4634,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // Allocate shadow area for Win64 if (IsCalleeWin64) - CCInfo.AllocateStack(32, 8); + CCInfo.AllocateStack(32, Align(8)); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); @@ -4693,6 +4749,7 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: + case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -4739,6 +4796,13 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) { } } +static bool isTargetShuffleSplat(SDValue Op) { + unsigned Opcode = Op.getOpcode(); + if (Opcode == ISD::EXTRACT_SUBVECTOR) + return isTargetShuffleSplat(Op.getOperand(0)); + return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD; +} + SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); @@ -4972,7 +5036,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); - Info.align = Align::None(); + Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; break; } @@ -4985,7 +5049,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = Align::None(); + Info.align = Align(1); Info.flags |= MachineMemOperand::MOLoad; break; } @@ -4997,7 +5061,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, unsigned NumElts = std::min(DataVT.getVectorNumElements(), IndexVT.getVectorNumElements()); Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); - Info.align = Align::None(); + Info.align = Align(1); Info.flags |= MachineMemOperand::MOStore; break; } @@ -5146,7 +5210,8 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); } -bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const { +bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool) const { // TODO: Allow 
vectors? if (VT.isVector()) return false; @@ -5374,6 +5439,19 @@ static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) { return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); } +/// Return true if the value of any element in Mask is the zero sentinel value. +static bool isAnyZero(ArrayRef<int> Mask) { + return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); +} + +/// Return true if the value of any element in Mask is the zero or undef +/// sentinel values. +static bool isAnyZeroOrUndef(ArrayRef<int> Mask) { + return llvm::any_of(Mask, [](int M) { + return M == SM_SentinelZero || M == SM_SentinelUndef; + }); +} + /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { @@ -5511,6 +5589,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) { return canWidenShuffleElements(Mask, WidenedMask); } +// Attempt to narrow/widen shuffle mask until it matches the target number of +// elements. +static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts, + SmallVectorImpl<int> &ScaledMask) { + unsigned NumSrcElts = Mask.size(); + assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && + "Illegal shuffle scale factor"); + + // Narrowing is guaranteed to work. + if (NumDstElts >= NumSrcElts) { + int Scale = NumDstElts / NumSrcElts; + llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask); + return true; + } + + // We have to repeat the widening until we reach the target size, but we can + // split out the first widening as it sets up ScaledMask for us. + if (canWidenShuffleElements(Mask, ScaledMask)) { + while (ScaledMask.size() > NumDstElts) { + SmallVector<int, 16> WidenedMask; + if (!canWidenShuffleElements(ScaledMask, WidenedMask)) + return false; + ScaledMask = std::move(WidenedMask); + } + return true; + } + + return false; +} + /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); @@ -5725,7 +5833,7 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); } -// Helper function to collect subvector ops that are concated together, +// Helper function to collect subvector ops that are concatenated together, // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series. // The subvectors in Ops are guaranteed to be the same type. static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { @@ -5736,8 +5844,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { return true; } - if (N->getOpcode() == ISD::INSERT_SUBVECTOR && - isa<ConstantSDNode>(N->getOperand(2))) { + if (N->getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Src = N->getOperand(0); SDValue Sub = N->getOperand(1); const APInt &Idx = N->getConstantOperandAPInt(2); @@ -5746,19 +5853,93 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) { // TODO - Handle more general insert_subvector chains. 
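scaleShuffleElements only needs canWidenShuffleElements for the widening direction; narrowing always succeeds because each coarse mask element expands to a contiguous run of finer elements. A standalone stand-in for that narrowing step (a simplified version of what llvm::narrowShuffleMaskElts is used for here; negative sentinel entries are simply replicated):

#include <cstdio>
#include <vector>

// Expand a shuffle mask over wide elements into an equivalent mask over
// elements that are Scale times narrower. Negative entries (undef/zero
// sentinels) are copied through unchanged.
static std::vector<int> narrowMask(const std::vector<int> &Mask, int Scale) {
  std::vector<int> Out;
  Out.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int J = 0; J != Scale; ++J)
      Out.push_back(M < 0 ? M : Scale * M + J);
  return Out;
}

int main() {
  // A v2i64 shuffle <1, 0> (swap the two quadwords) expressed over v4i32
  // elements becomes <2, 3, 0, 1>.
  for (int M : narrowMask({1, 0}, /*Scale=*/2))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}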
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) && - Idx == (VT.getVectorNumElements() / 2) && - Src.getOpcode() == ISD::INSERT_SUBVECTOR && - Src.getOperand(1).getValueType() == SubVT && - isNullConstant(Src.getOperand(2))) { - Ops.push_back(Src.getOperand(1)); - Ops.push_back(Sub); - return true; + Idx == (VT.getVectorNumElements() / 2)) { + // insert_subvector(insert_subvector(undef, x, lo), y, hi) + if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && + Src.getOperand(1).getValueType() == SubVT && + isNullConstant(Src.getOperand(2))) { + Ops.push_back(Src.getOperand(1)); + Ops.push_back(Sub); + return true; + } + // insert_subvector(x, extract_subvector(x, lo), hi) + if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { + Ops.append(2, Sub); + return true; + } } } return false; } +static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG, + const SDLoc &dl) { + EVT VT = Op.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + unsigned SizeInBits = VT.getSizeInBits(); + assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && + "Can't split odd sized vector"); + + SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2); + SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2); + return std::make_pair(Lo, Hi); +} + +// Split an unary integer op into 2 half sized ops. +static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Make sure we only try to split 256/512-bit types to avoid creating + // narrow vectors. + assert((Op.getOperand(0).getValueType().is256BitVector() || + Op.getOperand(0).getValueType().is512BitVector()) && + (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); + assert(Op.getOperand(0).getValueType().getVectorNumElements() == + VT.getVectorNumElements() && + "Unexpected VTs!"); + + SDLoc dl(Op); + + // Extract the Lo/Hi vectors + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, LoVT, Lo), + DAG.getNode(Op.getOpcode(), dl, HiVT, Hi)); +} + +/// Break a binary integer operation into 2 half sized ops and then +/// concatenate the result back. +static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Sanity check that all the types match. + assert(Op.getOperand(0).getValueType() == VT && + Op.getOperand(1).getValueType() == VT && "Unexpected VTs!"); + assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); + + SDLoc dl(Op); + + // Extract the LHS Lo/Hi vectors + SDValue LHS1, LHS2; + std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl); + + // Extract the RHS Lo/Hi vectors + SDValue RHS1, RHS2; + std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl); + + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1), + DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2)); +} + // Helper for splitting operands of an operation to legal target size and // apply a function on each part. 
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in @@ -5815,21 +5996,17 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); - - if (!isa<ConstantSDNode>(Idx)) - return SDValue(); + unsigned IdxVal = Op.getConstantOperandVal(2); // Inserting undef is a nop. We can just return the original vector. if (SubVec.isUndef()) return Vec; - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); // Extend to natively supported kshift. @@ -5849,7 +6026,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); - assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); @@ -5900,7 +6076,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows - // isel to opimitize when bits are known zero. + // isel to optimize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, DAG.getConstant(0, dl, WideOpVT), @@ -6042,8 +6218,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT, // Match (xor X, -1) -> X. // Match extract_subvector(xor X, -1) -> extract_subvector(X). // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). -static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { - V = peekThroughBitcasts(V); +static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) { + V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V); if (V.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(V.getOperand(1).getNode())) return V.getOperand(0); @@ -6067,6 +6243,35 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { return SDValue(); } +void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, + bool Lo, bool Unary) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int Pos = (i % NumEltsInLane) / 2 + LaneStart; + Pos += (Unary ? 0 : NumElts * (i % 2)); + Pos += (Lo ? 0 : NumEltsInLane / 2); + Mask.push_back(Pos); + } +} + +/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation +/// imposed by AVX and specific to the unary pattern. Example: +/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> +/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> +void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, + bool Lo) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + for (int i = 0; i < NumElts; ++i) { + int Pos = i / 2; + Pos += (Lo ? 0 : NumElts / 2); + Mask.push_back(Pos); + } +} + /// Returns a vector_shuffle node for an unpackl operation. 
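Both mask builders above (createUnpackShuffleMask and createSplat2ShuffleMask) are pure index arithmetic, so their output is easy to verify with a short standalone program (shown for a 128-bit v8i16, i.e. 8 elements with 8 elements per lane; not LLVM code):

#include <cstdio>
#include <vector>

// Same arithmetic as createUnpackShuffleMask for a type with NumElts elements
// and NumEltsInLane elements per 128-bit lane.
static std::vector<int> unpackMask(int NumElts, int NumEltsInLane, bool Lo,
                                   bool Unary) {
  std::vector<int> Mask;
  for (int I = 0; I < NumElts; ++I) {
    int LaneStart = (I / NumEltsInLane) * NumEltsInLane;
    int Pos = (I % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (I % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
  return Mask;
}

static void printMask(const char *Name, const std::vector<int> &Mask) {
  std::printf("%s:", Name);
  for (int M : Mask)
    std::printf(" %d", M);
  std::printf("\n");
}

int main() {
  // v8i16 unpacklo of two inputs interleaves the low halves:
  // <0, 8, 1, 9, 2, 10, 3, 11>
  printMask("unpacklo", unpackMask(8, 8, /*Lo=*/true, /*Unary=*/false));
  // Unary "splat2" low mask, as in the createSplat2ShuffleMask doc comment:
  // <0, 0, 1, 1, 2, 2, 3, 3>
  std::vector<int> Splat2;
  for (int I = 0; I < 8; ++I)
    Splat2.push_back(I / 2); // Lo half, so no NumElts/2 offset
  printMask("splat2 lo", Splat2);
  return 0;
}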
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -6102,14 +6307,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } -static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { - if (!Load || !ISD::isNormalLoad(Load)) - return nullptr; - - SDValue Ptr = Load->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); +static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) { + if (Ptr.getOpcode() == X86ISD::Wrapper || + Ptr.getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr.getOperand(0); auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) @@ -6118,6 +6319,12 @@ static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { return CNode->getConstVal(); } +static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { + if (!Load || !ISD::isNormalLoad(Load)) + return nullptr; + return getTargetConstantFromBasePtr(Load->getBasePtr()); +} + static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)); @@ -6298,23 +6505,6 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Extract constant bits from a broadcasted constant pool scalar. - if (Op.getOpcode() == X86ISD::VBROADCAST && - EltSizeInBits <= VT.getScalarSizeInBits()) { - if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { - unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); - unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; - - APInt UndefSrcElts(NumSrcElts, 0); - SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); - if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { - if (UndefSrcElts[0]) - UndefSrcElts.setBits(0, NumSrcElts); - SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); - return CastBitData(UndefSrcElts, SrcEltBits); - } - } - } - if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && EltSizeInBits <= VT.getScalarSizeInBits()) { auto *MemIntr = cast<MemIntrinsicSDNode>(Op); @@ -6322,16 +6512,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; SDValue Ptr = MemIntr->getBasePtr(); - if (Ptr->getOpcode() == X86ISD::Wrapper || - Ptr->getOpcode() == X86ISD::WrapperRIP) - Ptr = Ptr->getOperand(0); - - auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); - if (!CNode || CNode->isMachineConstantPoolEntry() || - CNode->getOffset() != 0) - return false; - - if (const Constant *C = CNode->getConstVal()) { + if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) { unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; @@ -6375,8 +6556,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Insert constant bits from a base and sub vector sources. - if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && - isa<ConstantSDNode>(Op.getOperand(2))) { + if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) { // TODO - support insert_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; @@ -6398,8 +6578,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } // Extract constant bits from a subvector's source. 
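X86::isConstantSplat gains an AllowPartialUndefs parameter so callers can decide whether a constant with undef lanes may still report a splat value; the underlying check is simply that every defined lane carries the same bits. A standalone simplification of that check, using plain integers and a separate undef bitmap instead of APInt (illustrative only, not the LLVM signature):

#include <cstdint>
#include <cstdio>
#include <vector>

// Return true and set SplatVal if every lane that is not marked undef holds
// the same constant. Fully-undef vectors report no splat here.
static bool isConstantSplatSimple(const std::vector<uint64_t> &Elts,
                                  const std::vector<bool> &Undef,
                                  uint64_t &SplatVal) {
  int SplatIndex = -1;
  for (size_t I = 0; I != Elts.size(); ++I) {
    if (Undef[I])
      continue;
    if (SplatIndex >= 0 && Elts[I] != Elts[SplatIndex])
      return false;
    SplatIndex = static_cast<int>(I);
  }
  if (SplatIndex < 0)
    return false;
  SplatVal = Elts[SplatIndex];
  return true;
}

int main() {
  uint64_t V = 0;
  // <7, undef, 7, 7> still counts as a splat of 7 when the caller asks
  // getTargetConstantBitsFromNode to tolerate partial undefs.
  bool Splat = isConstantSplatSimple({7, 0, 7, 7}, {false, true, false, false}, V);
  std::printf("splat=%d value=%llu\n", Splat, (unsigned long long)V);
  return 0;
}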
- if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && - isa<ConstantSDNode>(Op.getOperand(1))) { + if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { // TODO - support extract_subvector through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; @@ -6468,11 +6647,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, namespace llvm { namespace X86 { -bool isConstantSplat(SDValue Op, APInt &SplatVal) { +bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { APInt UndefElts; SmallVector<APInt, 16> EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), - UndefElts, EltBits, true, false)) { + UndefElts, EltBits, true, + AllowPartialUndefs)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) @@ -6513,20 +6693,26 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. +/// A multi-stage pack shuffle mask is created by specifying NumStages > 1. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, - bool Unary) { + bool Unary, unsigned NumStages = 1) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 0 : NumElts; + unsigned Repetitions = 1u << (NumStages - 1); + unsigned Increment = 1u << NumStages; + assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction"); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { - for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) - Mask.push_back(Elt + (Lane * NumEltsPerLane)); - for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) - Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); + for (unsigned Stage = 0; Stage != Repetitions; ++Stage) { + for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) + Mask.push_back(Elt + (Lane * NumEltsPerLane)); + for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) + Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); + } } } @@ -6597,7 +6783,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector<uint64_t, 32> RawMask; APInt RawUndefs; - SDValue ImmN; + uint64_t ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); @@ -6608,23 +6794,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::BLENDI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeBLENDMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeSHUFPMask(NumElems, MaskEltSize, - cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = 
N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeINSERTPSMask(ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: @@ -6672,13 +6857,23 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::VALIGN: + assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && + "Only 32-bit and 64-bit elements are supported!"); + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeVALIGNMask(NumElems, ImmN, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.push_back(N->getOperand(1)); + Ops.push_back(N->getOperand(0)); + break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePALIGNRMask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(0)); @@ -6686,39 +6881,34 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSLLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSRLDQMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSHUFMask(NumElems, MaskEltSize, - cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), 
- Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSHUFHWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodePSHUFLWMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: @@ -6770,8 +6960,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, } case X86ISD::VPERMI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeVPERMMask(NumElems, ImmN, Mask); IsUnary = true; break; case X86ISD::MOVSS: @@ -6783,17 +6973,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, case X86ISD::VPERM2X128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), - Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + DecodeVPERM2X128Mask(NumElems, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUF128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); - ImmN = N->getOperand(N->getNumOperands() - 1); - decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, - cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); + ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); + decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: @@ -6875,9 +7063,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; // Check if we're getting a shuffle mask with zero'd elements. - if (!AllowSentinelZero) - if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) - return false; + if (!AllowSentinelZero && isAnyZero(Mask)) + return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point @@ -7060,6 +7247,20 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask, continue; } + // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF + // base vectors. + if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue Vec = V.getOperand(0); + int NumVecElts = Vec.getValueType().getVectorNumElements(); + if (Vec.isUndef() && Size == NumVecElts) { + int Idx = V.getConstantOperandVal(2); + int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements(); + if (M < Idx || (Idx + NumSubElts) <= M) + KnownUndef.setBit(i); + } + continue; + } + // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) @@ -7111,7 +7312,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask, // TODO: Use DemandedElts variant. 
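// A minimal standalone sketch (not from the LLVM sources) of the known-undef
// rule added to getTargetShuffleAndZeroables above: for a shuffle source that
// is insert_subvector(undef v8i32, sub v4i32, idx 4), any lane outside the
// inserted range reads undef.
#include <cstdio>

int main() {
  const int Idx = 4, NumSubElts = 4, NumVecElts = 8;
  for (int M = 0; M < NumVecElts; ++M) {
    bool KnownUndef = (M < Idx) || (Idx + NumSubElts) <= M; // same test as above
    printf("lane %d: %s\n", M, KnownUndef ? "undef" : "from subvector");
  }
  return 0;
}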
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - SelectionDAG &DAG, unsigned Depth, + const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts); // Attempt to decode ops that could be represented as a shuffle mask. @@ -7120,7 +7321,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl<int> &Mask, SmallVectorImpl<SDValue> &Ops, - SelectionDAG &DAG, unsigned Depth, + const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { Mask.clear(); Ops.clear(); @@ -7132,6 +7333,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0) return false; assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size"); + unsigned NumSizeInBytes = NumSizeInBits / 8; + unsigned NumBytesPerElt = NumBitsPerElt / 8; unsigned Opcode = N.getOpcode(); switch (Opcode) { @@ -7179,8 +7382,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1); if (Known0.One.isNullValue() && Known1.One.isNullValue()) { bool IsByteMask = true; - unsigned NumSizeInBytes = NumSizeInBits / 8; - unsigned NumBytesPerElt = NumBitsPerElt / 8; APInt ZeroMask = APInt::getNullValue(NumBytesPerElt); APInt SelectMask = APInt::getNullValue(NumBytesPerElt); for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) { @@ -7220,10 +7421,21 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, !getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1, true)) return false; + + // Shuffle inputs must be the same size as the result. + if (llvm::any_of(SrcInputs0, [VT](SDValue Op) { + return VT.getSizeInBits() != Op.getValueSizeInBits(); + })) + return false; + if (llvm::any_of(SrcInputs1, [VT](SDValue Op) { + return VT.getSizeInBits() != Op.getValueSizeInBits(); + })) + return false; + size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector<int, 64> Mask0, Mask1; - scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0); - scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1); + narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0); + narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (size_t i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); @@ -7245,14 +7457,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); - if (!isa<ConstantSDNode>(N.getOperand(2)) || - !N->isOnlyUserOf(Sub.getNode())) + if (!N->isOnlyUserOf(Sub.getNode())) return false; uint64_t InsertIdx = N.getConstantOperandVal(2); // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). 
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && - Sub.getOperand(0).getValueType() == VT && - isa<ConstantSDNode>(Sub.getOperand(1))) { + Sub.getOperand(0).getValueType() == VT) { uint64_t ExtractIdx = Sub.getConstantOperandVal(1); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); @@ -7268,13 +7478,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs, SubMask, DAG, Depth + 1, ResolveKnownElts)) return false; + + // Subvector shuffle inputs must not be larger than the subvector. + if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) { + return SubVT.getSizeInBits() < SubInput.getValueSizeInBits(); + })) + return false; + if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); SmallVector<int,64> ScaledSubMask; - scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask); + narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { int Scale = SubMask.size() / NumSubElts; @@ -7284,14 +7501,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } } Ops.push_back(Src); - for (SDValue &SubInput : SubInputs) { - EVT SubSVT = SubInput.getValueType().getScalarType(); - EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT, - NumSizeInBits / SubSVT.getSizeInBits()); - Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT, - DAG.getUNDEF(AltVT), SubInput, - DAG.getIntPtrConstant(0, SDLoc(N)))); - } + Ops.append(SubInputs.begin(), SubInputs.end()); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { @@ -7304,75 +7514,83 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } return true; } - case ISD::SCALAR_TO_VECTOR: { - // Match against a scalar_to_vector of an extract from a vector, - // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. - SDValue N0 = N.getOperand(0); - SDValue SrcExtract; + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case ISD::SCALAR_TO_VECTOR: + case ISD::INSERT_VECTOR_ELT: { + // Match against a insert_vector_elt/scalar_to_vector of an extract from a + // vector, for matching src/dst vector types. + SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1); + + unsigned DstIdx = 0; + if (Opcode != ISD::SCALAR_TO_VECTOR) { + // Check we have an in-range constant insertion index. + if (!isa<ConstantSDNode>(N.getOperand(2)) || + N.getConstantOperandAPInt(2).uge(NumElts)) + return false; + DstIdx = N.getConstantOperandVal(2); + + // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern. + if (X86::isZeroNode(Scl)) { + Ops.push_back(N.getOperand(0)); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i); + return true; + } + } - if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - N0.getOperand(0).getValueType() == VT) || - (N0.getOpcode() == X86ISD::PEXTRW && - N0.getOperand(0).getValueType() == MVT::v8i16) || - (N0.getOpcode() == X86ISD::PEXTRB && - N0.getOperand(0).getValueType() == MVT::v16i8)) { - SrcExtract = N0; + // Peek through trunc/aext/zext. + // TODO: aext shouldn't require SM_SentinelZero padding. + // TODO: handle shift of scalars. 
+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits(); + while (Scl.getOpcode() == ISD::TRUNCATE || + Scl.getOpcode() == ISD::ANY_EXTEND || + Scl.getOpcode() == ISD::ZERO_EXTEND) { + Scl = Scl.getOperand(0); + MinBitsPerElt = + std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits()); } + if ((MinBitsPerElt % 8) != 0) + return false; + // Attempt to find the source vector the scalar was extracted from. + SDValue SrcExtract; + if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT || + Scl.getOpcode() == X86ISD::PEXTRW || + Scl.getOpcode() == X86ISD::PEXTRB) && + Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) { + SrcExtract = Scl; + } if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); - unsigned NumSrcElts = SrcVT.getVectorNumElements(); - unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; - - unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); - if (NumSrcElts <= SrcIdx) + if (!SrcVT.getScalarType().isByteSized()) return false; - - Ops.push_back(SrcVec); - Mask.push_back(SrcIdx); - Mask.append(NumZeros, SM_SentinelZero); - Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); - return true; - } - case X86ISD::PINSRB: - case X86ISD::PINSRW: { - SDValue InVec = N.getOperand(0); - SDValue InScl = N.getOperand(1); - SDValue InIndex = N.getOperand(2); - if (!isa<ConstantSDNode>(InIndex) || - cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts)) - return false; - uint64_t InIdx = N.getConstantOperandVal(2); - - // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. - if (X86::isZeroNode(InScl)) { - Ops.push_back(InVec); - for (unsigned i = 0; i != NumElts; ++i) - Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); - return true; + unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); + unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8); + unsigned DstByte = DstIdx * NumBytesPerElt; + MinBitsPerElt = + std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits()); + + // Create 'identity' byte level shuffle mask and then add inserted bytes. + if (Opcode == ISD::SCALAR_TO_VECTOR) { + Ops.push_back(SrcVec); + Mask.append(NumSizeInBytes, SM_SentinelUndef); + } else { + Ops.push_back(SrcVec); + Ops.push_back(N.getOperand(0)); + for (int i = 0; i != (int)NumSizeInBytes; ++i) + Mask.push_back(NumSizeInBytes + i); } - // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. - // TODO: Expand this to support INSERT_VECTOR_ELT/etc. - unsigned ExOp = - (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); - if (InScl.getOpcode() != ExOp) - return false; - - SDValue ExVec = InScl.getOperand(0); - SDValue ExIndex = InScl.getOperand(1); - if (!isa<ConstantSDNode>(ExIndex) || - cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts)) - return false; - uint64_t ExIdx = InScl.getConstantOperandVal(1); - - Ops.push_back(InVec); - Ops.push_back(ExVec); - for (unsigned i = 0; i != NumElts; ++i) - Mask.push_back(i == InIdx ? 
NumElts + ExIdx : i); + unsigned MinBytesPerElts = MinBitsPerElt / 8; + MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt); + for (unsigned i = 0; i != MinBytesPerElts; ++i) + Mask[DstByte + i] = SrcByte + i; + for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i) + Mask[DstByte + i] = SM_SentinelZero; return true; } case X86ISD::PACKSS: @@ -7412,6 +7630,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, createPackShuffleMask(VT, Mask, IsUnary); return true; } + case X86ISD::VTRUNC: { + SDValue Src = N.getOperand(0); + EVT SrcVT = Src.getValueType(); + // Truncated source must be a simple vector. + if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || + (SrcVT.getScalarSizeInBits() % 8) != 0) + return false; + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); + unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt; + assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation"); + for (unsigned i = 0; i != NumSrcElts; ++i) + Mask.push_back(i * Scale); + Mask.append(NumElts - NumSrcElts, SM_SentinelZero); + Ops.push_back(Src); + return true; + } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); @@ -7426,40 +7661,43 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, break; uint64_t ByteShift = ShiftVal / 8; - unsigned NumBytes = NumSizeInBits / 8; - unsigned NumBytesPerElt = NumBitsPerElt / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. - Mask.append(NumBytes, SM_SentinelZero); + Mask.append(NumSizeInBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { - for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { - for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) + for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } + case X86ISD::VROTLI: + case X86ISD::VROTRI: { + // We can only decode 'whole byte' bit rotates as shuffles. + uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt); + if ((RotateVal % 8) != 0) + return false; + Ops.push_back(N.getOperand(0)); + int Offset = RotateVal / 8; + Offset = (X86ISD::VROTLI == Opcode ? 
NumBytesPerElt - Offset : Offset); + for (int i = 0; i != (int)NumElts; ++i) { + int BaseIdx = i * NumBytesPerElt; + for (int j = 0; j != (int)NumBytesPerElt; ++j) { + Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt)); + } + } + return true; + } case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); - MVT SrcVT = Src.getSimpleValueType(); - if (!SrcVT.isVector()) + if (!Src.getSimpleValueType().isVector()) return false; - - if (NumSizeInBits != SrcVT.getSizeInBits()) { - assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && - "Illegal broadcast type"); - SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), - NumSizeInBits / SrcVT.getScalarSizeInBits()); - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, - DAG.getUNDEF(SrcVT), Src, - DAG.getIntPtrConstant(0, SDLoc(N))); - } - Ops.push_back(Src); Mask.append(NumElts, 0); return true; @@ -7476,22 +7714,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, (SrcVT.getScalarSizeInBits() % 8) != 0) return false; - unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits(); bool IsAnyExtend = (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); - DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend, - Mask); - - if (NumSizeInBits != SrcVT.getSizeInBits()) { - assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 && - "Illegal zero-extension type"); - SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(), - NumSizeInBits / NumSrcBitsPerElt); - Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT, - DAG.getUNDEF(SrcVT), Src, - DAG.getIntPtrConstant(0, SDLoc(N))); - } - + DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, + IsAnyExtend, Mask); Ops.push_back(Src); return true; } @@ -7549,7 +7775,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, APInt &KnownUndef, APInt &KnownZero, - SelectionDAG &DAG, unsigned Depth, + const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) @@ -7570,7 +7796,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, SmallVectorImpl<int> &Mask, - SelectionDAG &DAG, unsigned Depth = 0, + const SelectionDAG &DAG, unsigned Depth = 0, bool ResolveKnownElts = true) { EVT VT = Op.getValueType(); if (!VT.isSimple() || !VT.isVector()) @@ -7583,93 +7809,107 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, KnownZero, DAG, Depth, ResolveKnownElts); } -/// Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the i'th /// element of the result of the vector shuffle. -static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, - unsigned Depth) { - if (Depth == 6) - return SDValue(); // Limit search depth. +static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, + SelectionDAG &DAG, unsigned Depth) { + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); // Limit search depth. - SDValue V = SDValue(N, 0); - EVT VT = V.getValueType(); - unsigned Opcode = V.getOpcode(); + EVT VT = Op.getValueType(); + unsigned Opcode = Op.getOpcode(); + unsigned NumElems = VT.getVectorNumElements(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 
- if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { + if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); - unsigned NumElems = VT.getVectorNumElements(); - SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) - : SV->getOperand(1); - return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); + SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); + return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { - MVT ShufVT = V.getSimpleValueType(); + MVT ShufVT = VT.getSimpleVT(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; SmallVector<SDValue, 16> ShuffleOps; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps, + ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) - return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) - : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); + return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT) + : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); - assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); - SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; - return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, - Depth+1); + assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range"); + SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; + return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); } // Recurse into insert_subvector base/sub vector to find scalars. - if (Opcode == ISD::INSERT_SUBVECTOR && - isa<ConstantSDNode>(N->getOperand(2))) { - SDValue Vec = N->getOperand(0); - SDValue Sub = N->getOperand(1); - EVT SubVT = Sub.getValueType(); - unsigned NumSubElts = SubVT.getVectorNumElements(); - uint64_t SubIdx = N->getConstantOperandVal(2); + if (Opcode == ISD::INSERT_SUBVECTOR) { + SDValue Vec = Op.getOperand(0); + SDValue Sub = Op.getOperand(1); + uint64_t SubIdx = Op.getConstantOperandVal(2); + unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) - return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1); - return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1); + return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1); + return getShuffleScalarElt(Vec, Index, DAG, Depth + 1); + } + + // Recurse into concat_vectors sub vector to find scalars. + if (Opcode == ISD::CONCAT_VECTORS) { + EVT SubVT = Op.getOperand(0).getValueType(); + unsigned NumSubElts = SubVT.getVectorNumElements(); + uint64_t SubIdx = Index / NumSubElts; + uint64_t SubElt = Index % NumSubElts; + return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1); } // Recurse into extract_subvector src vector to find scalars. 
- if (Opcode == ISD::EXTRACT_SUBVECTOR && - isa<ConstantSDNode>(N->getOperand(1))) { - SDValue Src = N->getOperand(0); - uint64_t SrcIdx = N->getConstantOperandVal(1); - return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1); + if (Opcode == ISD::EXTRACT_SUBVECTOR) { + SDValue Src = Op.getOperand(0); + uint64_t SrcIdx = Op.getConstantOperandVal(1); + return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1); } - // Actual nodes that may contain scalar elements + // We only peek through bitcasts of the same vector width. if (Opcode == ISD::BITCAST) { - V = V.getOperand(0); - EVT SrcVT = V.getValueType(); - unsigned NumElems = VT.getVectorNumElements(); + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems) + return getShuffleScalarElt(Src, Index, DAG, Depth + 1); + return SDValue(); + } - if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) - return SDValue(); + // Actual nodes that may contain scalar elements + + // For insert_vector_elt - either return the index matching scalar or recurse + // into the base vector. + if (Opcode == ISD::INSERT_VECTOR_ELT && + isa<ConstantSDNode>(Op.getOperand(2))) { + if (Op.getConstantOperandAPInt(2) == Index) + return Op.getOperand(1); + return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1); } - if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) - return (Index == 0) ? V.getOperand(0) + if (Opcode == ISD::SCALAR_TO_VECTOR) + return (Index == 0) ? Op.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); - if (V.getOpcode() == ISD::BUILD_VECTOR) - return V.getOperand(Index); + if (Opcode == ISD::BUILD_VECTOR) + return Op.getOperand(Index); return SDValue(); } @@ -7762,10 +8002,11 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, Elt = NextElt; } - // If our first insertion is not the first index then insert into zero - // vector to break any register dependency else use SCALAR_TO_VECTOR. + // If our first insertion is not the first index or zeros are needed, then + // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high + // elements undefined). if (!V) { - if (i != 0) + if (i != 0 || NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); @@ -7964,11 +8205,12 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. - unsigned RequiredAlign = VT.getSizeInBits()/8; + Align RequiredAlign(VT.getSizeInBits() / 8); SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { + MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr); + if (!InferredAlign || *InferredAlign < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. @@ -7983,9 +8225,9 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, // Ptr + (Offset & ~15). 
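      // Worked example (not from the LLVM sources): with a 16-byte
      // RequiredAlign and Offset = 20, 20 % 16 = 4 and (4 & 3) == 0, so the
      // load can still be used; StartOffset = 20 & ~15 = 16 and the pointer
      // is rebased by 16 bytes below.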
if (Offset < 0) return SDValue(); - if ((Offset % RequiredAlign) & 3) + if ((Offset % RequiredAlign.value()) & 3) return SDValue(); - int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -8024,8 +8266,8 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { case ISD::SCALAR_TO_VECTOR: return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); case ISD::SRL: - if (isa<ConstantSDNode>(Elt.getOperand(1))) { - uint64_t Idx = Elt.getConstantOperandVal(1); + if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) { + uint64_t Idx = IdxC->getZExtValue(); if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { ByteOffset += Idx / 8; return true; @@ -8033,13 +8275,13 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { } break; case ISD::EXTRACT_VECTOR_ELT: - if (isa<ConstantSDNode>(Elt.getOperand(1))) { + if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) { SDValue Src = Elt.getOperand(0); unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && findEltLoadSrc(Src, Ld, ByteOffset)) { - uint64_t Idx = Elt.getConstantOperandVal(1); + uint64_t Idx = IdxC->getZExtValue(); ByteOffset += Idx * (SrcSizeInBits / 8); return true; } @@ -8169,7 +8411,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, "Cannot merge volatile or atomic loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), - LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); + LDBase->getPointerInfo(), LDBase->getOriginalAlign(), + MMOFlags); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, NewLd); @@ -8247,14 +8490,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); + // Allow v4f32 on SSE1 only targets. + // FIXME: Add more isel patterns so we can just use VT directly. + if (!Subtarget.hasSSE2() && VT == MVT::v4f32) + VecVT = MVT::v4f32; if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, - LDBase->getPointerInfo(), - LDBase->getAlignment(), - MachineMemOperand::MOLoad); + SDValue ResNode = DAG.getMemIntrinsicNode( + X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), + LDBase->getOriginalAlign(), MachineMemOperand::MOLoad); for (auto *LD : Loads) if (LD) DAG.makeEquivalentMemoryOrdering(LD, ResNode); @@ -8318,13 +8563,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, // Combine a vector ops (shuffles etc.) that is equal to build_vector load1, // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses // are consecutive, non-overlapping, and in the right order. 
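// A minimal standalone sketch (not from the LLVM sources) of the "consecutive,
// non-overlapping, in the right order" condition the combine below depends on,
// checked here on plain byte offsets: element i must start exactly
// i * EltSizeBytes past element 0.
#include <cstdio>

static bool areConsecutive(const long long *ByteOffsets, int NumElts,
                           long long EltSizeBytes) {
  for (int i = 1; i < NumElts; ++i)
    if (ByteOffsets[i] != ByteOffsets[0] + i * EltSizeBytes)
      return false;
  return true;
}

int main() {
  long long Offs[4] = {32, 36, 40, 44}; // four i32 loads at Base+32..Base+44
  printf("%s\n", areConsecutive(Offs, 4, 4) ? "foldable into one v4i32 load"
                                            : "not consecutive");
  return 0;
}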
-static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL, +static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { SmallVector<SDValue, 64> Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { - if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { + if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) { Elts.push_back(Elt); continue; } @@ -8439,7 +8684,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SDValue Ld = BVOp->getSplatValue(&UndefElements); // Attempt to use VBROADCASTM - // From this paterrn: + // From this pattern: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // @@ -8486,8 +8731,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (Subtarget.hasAVX()) { - if (SplatBitSize <= 64 && Subtarget.hasAVX2() && - !(SplatBitSize == 64 && Subtarget.is32Bit())) { + if (SplatBitSize == 32 || SplatBitSize == 64 || + (SplatBitSize < 32 && Subtarget.hasAVX2())) { // Splatted value can fit in one INTEGER constant in constant pool. // Load the constant and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); @@ -8496,46 +8741,25 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad( - CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); - SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, - MVT::getVectorVT(CVT, Repeat), Ld); - return DAG.getBitcast(VT, Brdcst); - } else if (SplatBitSize == 32 || SplatBitSize == 64) { - // Splatted value can fit in one FLOAT constant in constant pool. - // Load the constant and broadcast it. - // AVX have support for 32 and 64 bit broadcast for floats only. - // No 64bit integer in 32bit subtarget. - MVT CVT = MVT::getFloatingPointVT(SplatBitSize); - // Lower the splat via APFloat directly, to avoid any conversion. - Constant *C = - SplatBitSize == 32 - ? ConstantFP::get(*Ctx, - APFloat(APFloat::IEEEsingle(), SplatValue)) - : ConstantFP::get(*Ctx, - APFloat(APFloat::IEEEdouble(), SplatValue)); - SDValue CP = DAG.getConstantPool(C, PVT); - unsigned Repeat = VT.getSizeInBits() / SplatBitSize; - - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad( - CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); - SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, - MVT::getVectorVT(CVT, Repeat), Ld); + Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); + SDVTList Tys = + DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + SDValue Brdcst = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment, + MachineMemOperand::MOLoad); return DAG.getBitcast(VT, Brdcst); - } else if (SplatBitSize > 64) { + } + if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. 
MVT CVT = VT.getScalarType(); Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); - unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment(); + Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign(); Ld = DAG.getLoad( MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), @@ -8560,10 +8784,12 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); + bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. - if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) + // FIXME: Is the use count needed for non-constant, non-load case? + if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); @@ -8603,18 +8829,17 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); - unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad( - CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - Alignment); + Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {DAG.getEntryNode(), CP}; + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, + MPI, Alignment, MachineMemOperand::MOLoad); } } - bool IsLoad = ISD::isNormalLoad(Ld.getNode()); - // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) @@ -8624,15 +8849,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, if (!IsLoad) return SDValue(); + // Make sure the non-chain result is only used by this build vector. 
+ if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0)) + return SDValue(); + if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || - (Subtarget.hasVLX() && ScalarSize == 64)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + (Subtarget.hasVLX() && ScalarSize == 64)) { + auto *LN = cast<LoadSDNode>(Ld); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + SDValue BCast = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); + return BCast; + } // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm - if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { - if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + if (Subtarget.hasInt256() && Ld.getValueType().isInteger() && + (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) { + auto *LN = cast<LoadSDNode>(Ld); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + SDValue BCast = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1)); + return BCast; } // Unsupported broadcast. @@ -8746,20 +8990,6 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { - assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && - Op.getScalarValueSizeInBits() == 1 && - "Can not convert non-constant vector"); - uint64_t Immediate = 0; - for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { - SDValue In = Op.getOperand(idx); - if (!In.isUndef()) - Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; - } - SDLoc dl(Op); - MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8)); - return DAG.getConstant(Immediate, dl, VT); -} // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -8782,11 +9012,11 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; - if (!isa<ConstantSDNode>(In)) - NonConstIdx.push_back(idx); - else { - Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx; + if (auto *InC = dyn_cast<ConstantSDNode>(In)) { + Immediate |= (InC->getZExtValue() & 0x1) << idx; HasConstElts = true; + } else { + NonConstIdx.push_back(idx); } if (SplatIdx < 0) SplatIdx = idx; @@ -8805,9 +9035,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, if (Cond.getOpcode() != ISD::SETCC) Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond, DAG.getConstant(1, dl, MVT::i8)); - return DAG.getSelect(dl, VT, Cond, - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + + // Perform the select in the scalar domain so we can use cmov. 
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + SDValue Select = DAG.getSelect(dl, MVT::i32, Cond, + DAG.getAllOnesConstant(dl, MVT::i32), + DAG.getConstant(0, dl, MVT::i32)); + Select = DAG.getBitcast(MVT::v32i1, Select); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select); + } else { + MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U)); + SDValue Select = DAG.getSelect(dl, ImmVT, Cond, + DAG.getAllOnesConstant(dl, ImmVT), + DAG.getConstant(0, dl, ImmVT)); + MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1; + Select = DAG.getBitcast(VecVT, Select); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select, + DAG.getIntPtrConstant(0, dl)); + } } // insert elements one by one @@ -8907,8 +9152,8 @@ static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, if (!CanFold) break; - unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); - unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue(); + unsigned I0 = Op0.getConstantOperandVal(1); + unsigned I1 = Op1.getConstantOperandVal(1); if (i * 2 < NumElts) { if (V0.isUndef()) { @@ -9056,11 +9301,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa<ConstantSDNode>(Op0.getOperand(1)) || - !isa<ConstantSDNode>(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; - unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue(); + unsigned I0 = Op0.getConstantOperandVal(1); if (I0 != i) return false; @@ -9445,6 +9689,9 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, return SDValue(); } +static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, + SelectionDAG &DAG); + /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. @@ -9452,6 +9699,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, + const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); @@ -9515,7 +9763,14 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); - return DAG.getNode(Opcode, DL, VT, LHS, RHS); + SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS); + + if (!IsShift) + return Res; + + // Immediately lower the shift to ensure the constant build vector doesn't + // get converted to a constant pool before the shift is lowered. + return LowerShift(Res, Subtarget, DAG); } /// Create a vector constant without a load. 
SSE/AVX provide the bare minimum @@ -9571,9 +9826,11 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); - return extractSubVector( - createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0, - DAG, DL, SizeInBits); + SDValue NewSrcVec = + createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); + if (NewSrcVec) + return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits); + return SDValue(); } else if (SrcVec.getValueSizeInBits() < SizeInBits) { // Widen smaller SrcVec to match VT. SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); @@ -9869,7 +10126,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; - if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) + if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) return BitOp; unsigned EVTBits = EltVT.getSizeInBits(); @@ -9929,7 +10186,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; - InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); + InsIndex = DAG.getVectorIdxConstant(i, dl); } } Constant *CV = ConstantVector::get(ConstVecOps); @@ -10929,6 +11186,71 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, return SDValue(); } +/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) +/// followed by unpack 256-bit. +static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + SmallVector<int, 32> Unpckl, Unpckh; + createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true); + createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false); + + unsigned UnpackOpcode; + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + UnpackOpcode = X86ISD::UNPCKL; + else if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + UnpackOpcode = X86ISD::UNPCKH; + else + return SDValue(); + + // This is a "natural" unpack operation (rather than the 128-bit sectored + // operation implemented by AVX). We need to rearrange 64-bit chunks of the + // input in order to use the x86 instruction. + V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1), + DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3}); + V1 = DAG.getBitcast(VT, V1); + return DAG.getNode(UnpackOpcode, DL, VT, V1, V1); +} + +// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the +// source into the lower elements and zeroing the upper elements. +// TODO: Merge with matchShuffleAsVPMOV. 
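// A minimal standalone sketch (not from the LLVM sources) of the mask shape
// matchShuffleAsVTRUNC below accepts: for a v16i8 shuffle, sequential even
// elements in the low half plus a zeroable upper half corresponds to
// truncating a v8i16 source into the low 8 bytes and zeroing the rest.
#include <cstdio>

int main() {
  const int Z = -2; // stands in for SM_SentinelZero
  int Mask[16] = {0, 2, 4, 6, 8, 10, 12, 14, Z, Z, Z, Z, Z, Z, Z, Z};
  const int NumElts = 16, Scale = 2, NumSrcElts = NumElts / Scale;
  bool SeqLo = true, ZeroHi = true;
  for (int i = 0; i < NumSrcElts; ++i)
    SeqLo &= (Mask[i] == i * Scale); // low elements step by Scale
  for (int i = NumSrcElts; i < NumElts; ++i)
    ZeroHi &= (Mask[i] == Z);        // upper elements must be zeroable
  printf("%s\n", (SeqLo && ZeroHi) ? "matches a v8i16 -> v8i8 VTRUNC pattern"
                                   : "no match");
  return 0;
}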
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, + ArrayRef<int> Mask, const APInt &Zeroable, + const X86Subtarget &Subtarget) { + if (!VT.is512BitVector() && !Subtarget.hasVLX()) + return false; + + unsigned NumElts = Mask.size(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned MaxScale = 64 / EltSizeInBits; + + for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { + unsigned SrcEltBits = EltSizeInBits * Scale; + if (SrcEltBits < 32 && !Subtarget.hasBWI()) + continue; + unsigned NumSrcElts = NumElts / Scale; + if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale)) + continue; + unsigned UpperElts = NumElts - NumSrcElts; + if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue()) + continue; + SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale); + SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts); + DstVT = MVT::getIntegerVT(EltSizeInBits); + if ((NumSrcElts * EltSizeInBits) >= 128) { + // ISD::TRUNCATE + DstVT = MVT::getVectorVT(DstVT, NumSrcElts); + } else { + // X86ISD::VTRUNC + DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits); + } + return true; + } + + return false; +} + static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps, int Delta) { int Size = (int)Mask.size(); @@ -11022,22 +11344,93 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask, return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src); } +/// Check whether a compaction lowering can be done by dropping even +/// elements and compute how many times even elements must be dropped. +/// +/// This handles shuffles which take every Nth element where N is a power of +/// two. Example shuffle masks: +/// +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 +/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 +/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 +/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 +/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 +/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 +/// +/// Any of these lanes can of course be undef. +/// +/// This routine only supports N <= 3. +/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here +/// for larger N. +/// +/// \returns N above, or the number of times even elements must be dropped if +/// there is such a number. Otherwise returns zero. +static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, + bool IsSingleInput) { + // The modulus for the shuffle vector entries is based on whether this is + // a single input or not. + int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); + assert(isPowerOf2_32((uint32_t)ShuffleModulus) && + "We should only be called with masks with a power-of-2 size!"); + + uint64_t ModMask = (uint64_t)ShuffleModulus - 1; + + // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, + // and 2^3 simultaneously. This is because we may have ambiguity with + // partially undef inputs. + bool ViableForN[3] = {true, true, true}; + + for (int i = 0, e = Mask.size(); i < e; ++i) { + // Ignore undef lanes, we'll optimistically collapse them to the pattern we + // want. + if (Mask[i] < 0) + continue; + + bool IsAnyViable = false; + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) { + uint64_t N = j + 1; + + // The shuffle mask must be equal to (i * 2^N) % M. 
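        // Worked example (not from the LLVM sources): for N = 1 on a
        // single-input 16-element mask, ModMask = 15 and element i must equal
        // (2 * i) & 15, i.e. 0,2,4,...,14,0,2,...,14 - exactly the first
        // pattern listed in the comment above.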
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) + IsAnyViable = true; + else + ViableForN[j] = false; + } + // Early exit if we exhaust the possible powers of two. + if (!IsAnyViable) + break; + } + + for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) + if (ViableForN[j]) + return j + 1; + + // Return 0 as there is no viable power of two. + return 0; +} + // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. +// Checks for compaction shuffle masks if MaxStages > 1. +// TODO: Add support for matching multiple PACKSS/PACKUS stages. static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef<int> TargetMask, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + const X86Subtarget &Subtarget, + unsigned MaxStages = 1) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); - MVT PackSVT = MVT::getIntegerVT(BitSize * 2); - MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); + assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && + "Illegal maximum compaction"); - auto MatchPACK = [&](SDValue N1, SDValue N2) { + auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) { + unsigned NumSrcBits = PackVT.getScalarSizeInBits(); + unsigned NumPackedBits = NumSrcBits - BitSize; SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); - if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { - APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); + if (Subtarget.hasSSE41() || BitSize == 8) { + APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; @@ -11047,8 +11440,8 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, return true; } } - if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && - (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { + if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) && + (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; @@ -11058,19 +11451,25 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, return false; }; - // Try binary shuffle. - SmallVector<int, 32> BinaryMask; - createPackShuffleMask(VT, BinaryMask, false); - if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) - if (MatchPACK(V1, V2)) - return true; + // Attempt to match against wider and wider compaction patterns. + for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) { + MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages); + MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages); - // Try unary shuffle. - SmallVector<int, 32> UnaryMask; - createPackShuffleMask(VT, UnaryMask, true); - if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) - if (MatchPACK(V1, V1)) - return true; + // Try binary shuffle. + SmallVector<int, 32> BinaryMask; + createPackShuffleMask(VT, BinaryMask, false, NumStages); + if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2)) + if (MatchPACK(V1, V2, PackVT)) + return true; + + // Try unary shuffle. 
+ SmallVector<int, 32> UnaryMask; + createPackShuffleMask(VT, UnaryMask, true, NumStages); + if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1)) + if (MatchPACK(V1, V1, PackVT)) + return true; + } return false; } @@ -11080,12 +11479,44 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; - if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, - Subtarget)) - return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), - DAG.getBitcast(PackVT, V2)); + unsigned SizeBits = VT.getSizeInBits(); + unsigned EltBits = VT.getScalarSizeInBits(); + unsigned MaxStages = Log2_32(64 / EltBits); + if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, + Subtarget, MaxStages)) + return SDValue(); - return SDValue(); + unsigned CurrentEltBits = PackVT.getScalarSizeInBits(); + unsigned NumStages = Log2_32(CurrentEltBits / EltBits); + + // Don't lower multi-stage packs on AVX512, truncation is better. + if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX()) + return SDValue(); + + // Pack to the largest type possible: + // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. + unsigned MaxPackBits = 16; + if (CurrentEltBits > 16 && + (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41())) + MaxPackBits = 32; + + // Repeatedly pack down to the target size. + SDValue Res; + for (unsigned i = 0; i != NumStages; ++i) { + unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits); + unsigned NumSrcElts = SizeBits / SrcEltBits; + MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); + MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2); + MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); + MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2); + Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1), + DAG.getBitcast(SrcVT, V2)); + V1 = V2 = Res; + CurrentEltBits /= 2; + } + assert(Res && Res.getValueType() == VT && + "Failed to lower compaction shuffle"); + return Res; } /// Try to emit a bitmask instruction for a shuffle. @@ -11109,8 +11540,9 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, MVT LogicVT = VT; if (EltVT == MVT::f32 || EltVT == MVT::f64) { Zero = DAG.getConstantFP(0.0, DL, EltVT); - AllOnes = DAG.getConstantFP( - APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT); + APFloat AllOnesValue = APFloat::getAllOnesValue( + SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits()); + AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT); LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); } else { @@ -11312,6 +11744,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } + // If we have VPTERNLOG, we can use that as a bit blend. + if (Subtarget.hasVLX()) + if (SDValue BitBlend = + lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + return BitBlend; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -11622,10 +12060,101 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend( return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } -/// Try to lower a vector shuffle as a rotation. +/// Try to lower a vector shuffle as a bit rotation. +/// +/// Look for a repeated rotation pattern in each sub group. +/// Returns a ISD::ROTL element rotation amount or -1 if failed. 
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) { + int NumElts = Mask.size(); + assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask"); + + int RotateAmt = -1; + for (int i = 0; i != NumElts; i += NumSubElts) { + for (int j = 0; j != NumSubElts; ++j) { + int M = Mask[i + j]; + if (M < 0) + continue; + if (!isInRange(M, i, i + NumSubElts)) + return -1; + int Offset = (NumSubElts - (M - (i + j))) % NumSubElts; + if (0 <= RotateAmt && Offset != RotateAmt) + return -1; + RotateAmt = Offset; + } + } + return RotateAmt; +} + +static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits, + const X86Subtarget &Subtarget, + ArrayRef<int> Mask) { + assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); + assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers"); + + // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size. + int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2; + int MaxSubElts = 64 / EltSizeInBits; + for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) { + int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts); + if (RotateAmt < 0) + continue; + + int NumElts = Mask.size(); + MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); + RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); + return RotateAmt * EltSizeInBits; + } + + return -1; +} + +/// Lower shuffle using X86ISD::VROTLI rotations. +static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, + ArrayRef<int> Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + // Only XOP + AVX512 targets have bit rotation instructions. + // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this. + bool IsLegal = + (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512(); + if (!IsLegal && Subtarget.hasSSE3()) + return SDValue(); + + MVT RotateVT; + int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(), + Subtarget, Mask); + if (RotateAmt < 0) + return SDValue(); + + // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL, + // expanded to OR(SRL,SHL), will be more efficient, but if they can + // widen to vXi16 or more then existing lowering should will be better. + if (!IsLegal) { + if ((RotateAmt % 16) == 0) + return SDValue(); + // TODO: Use getTargetVShiftByConstNode. + unsigned ShlAmt = RotateAmt; + unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt; + V1 = DAG.getBitcast(RotateVT, V1); + SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1, + DAG.getTargetConstant(ShlAmt, DL, MVT::i8)); + SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1, + DAG.getTargetConstant(SrlAmt, DL, MVT::i8)); + SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL); + return DAG.getBitcast(VT, Rot); + } + + SDValue Rot = + DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1), + DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); + return DAG.getBitcast(VT, Rot); +} + +/// Try to match a vector shuffle as an element rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. 
-static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { +static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, + ArrayRef<int> Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: @@ -11712,7 +12241,7 @@ static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef<int> Mask) { // Don't accept any shuffles with zero elements. - if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) + if (isAnyZero(Mask)) return -1; // PALIGNR works on 128-bit lanes. @@ -11720,7 +12249,7 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; - int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask); + int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; @@ -11788,7 +12317,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. -static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, +static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { @@ -11800,7 +12329,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; - int Rotation = matchShuffleAsRotate(Lo, Hi, Mask); + int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); @@ -12566,13 +13095,13 @@ static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); - EVT EltVT = VT.getVectorElementType(); - EVT V0VT = V0.getValueType(); + MVT EltVT = VT.getVectorElementType(); + MVT V0VT = V0.getSimpleValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); - EVT V0EltVT = V0VT.getVectorElementType(); + MVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); @@ -12636,7 +13165,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) { static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef<int> Mask, SelectionDAG &DAG) { - EVT VT = N0.getValueType(); + MVT VT = N0.getSimpleValueType(); assert((VT.is128BitVector() && (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) && "VPERM* family of shuffles requires 32-bit or 64-bit elements"); @@ -12649,9 +13178,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, return SDValue(); SDValue WideVec = N0.getOperand(0); - EVT WideVT = WideVec.getValueType(); - if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) || - !isa<ConstantSDNode>(N1.getOperand(1))) + MVT WideVT = WideVec.getSimpleValueType(); + if (!WideVT.is256BitVector()) return SDValue(); // Match extracts of each half of the wide source vector. 
Commute the shuffle @@ -12699,7 +13227,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. - unsigned NumElts = Mask.size(); unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP @@ -12707,15 +13234,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. - int BroadcastIdx = -1; - for (int i = 0; i != (int)NumElts; ++i) { - SmallVector<int, 8> BroadcastMask(NumElts, i); - if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { - BroadcastIdx = i; - break; - } - } - + int BroadcastIdx = getSplatIndex(Mask); if (BroadcastIdx < 0) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " @@ -12724,6 +13243,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. + // TODO: Combine this logic with findEltLoadSrc() used by + // EltsFromConsecutiveLoads(). int BitOffset = BroadcastIdx * NumEltBits; SDValue V = V1; for (;;) { @@ -12739,14 +13260,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, BitOffset %= OpBitWidth; continue; } + case ISD::EXTRACT_SUBVECTOR: { + // The extraction index adds to the existing offset. + unsigned EltBitWidth = V.getScalarValueSizeInBits(); + unsigned Idx = V.getConstantOperandVal(1); + unsigned BeginOffset = Idx * EltBitWidth; + BitOffset += BeginOffset; + V = V.getOperand(0); + continue; + } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); - auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2)); - if (!ConstantIdx) - break; - int EltBitWidth = VOuter.getScalarValueSizeInBits(); - int Idx = (int)ConstantIdx->getZExtValue(); + int Idx = (int)V.getConstantOperandVal(2); int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements(); int BeginOffset = Idx * EltBitWidth; int EndOffset = BeginOffset + NumSubElts * EltBitWidth; @@ -12777,8 +13303,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, DL, VT, V, BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; - MVT BroadcastVT = VT; - // Also check the simpler case, where we can directly reuse the scalar. if (!BitCastSrc && ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || @@ -12788,23 +13312,34 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) { - // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { - BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); - Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) - ? 
X86ISD::MOVDDUP - : Opcode; - } + } else if (ISD::isNormalLoad(V.getNode()) && + cast<LoadSDNode>(V)->isSimple()) { + // We do not check for one-use of the vector load because a broadcast load + // is expected to be a win for code size, register pressure, and possibly + // uops even if the original vector load is not eliminated. - // If we are broadcasting a load that is only used by the shuffle - // then we can reduce the vector load to the broadcasted scalar load. + // Reduce the vector load and shuffle to a broadcasted scalar load. LoadSDNode *Ld = cast<LoadSDNode>(V); SDValue BaseAddr = Ld->getOperand(1); - EVT SVT = BroadcastVT.getScalarType(); + MVT SVT = VT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset"); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + + // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather + // than MOVDDUP. + // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX? + if (Opcode == X86ISD::VBROADCAST) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {Ld->getChain(), NewAddr}; + V = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); + DAG.makeEquivalentMemoryOrdering(Ld, V); + return DAG.getBitcast(VT, V); + } + assert(SVT == MVT::f64 && "Unexpected VT!"); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); @@ -12839,38 +13374,26 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); - // Bitcast back to the same scalar type as BroadcastVT. - if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) { - assert(NumEltBits == BroadcastVT.getScalarSizeInBits() && - "Unexpected vector element size"); - MVT ExtVT; - if (V.getValueType().isVector()) { - unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; - ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); - } else { - ExtVT = BroadcastVT.getScalarType(); - } - V = DAG.getBitcast(ExtVT, V); - } - - // 32-bit targets need to load i64 as a f64 and then bitcast the result. - if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) { - V = DAG.getBitcast(MVT::f64, V); - unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); - BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); + // If this is a scalar, do the broadcast on this type and bitcast. + if (!V.getValueType().isVector()) { + assert(V.getScalarValueSizeInBits() == NumEltBits && + "Unexpected scalar size"); + MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(), + VT.getVectorNumElements()); + return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. 
- if (V.getValueSizeInBits() > 128) { - MVT ExtVT = V.getSimpleValueType().getScalarType(); - ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits()); + if (V.getValueSizeInBits() > 128) V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); - V = DAG.getBitcast(ExtVT, V); - } - return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); + // Otherwise cast V to a vector with the same element type as VT, but + // possibly narrower than VT. Then perform the broadcast. + unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; + MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts); + return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use @@ -13259,7 +13782,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -13293,8 +13816,7 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; - int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; - + SmallVector<int, 4> NewMask(Mask.begin(), Mask.end()); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { @@ -13548,7 +14070,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -14186,6 +14708,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit rotation instructions. + if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask, + Subtarget, DAG)) + return Rotate; + // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; @@ -14262,6 +14789,29 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return V; + // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW. + // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to + // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain. + int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false); + if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() && + !Subtarget.hasVLX()) { + SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32)); + for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1)) + DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32); + SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps); + V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1), + DWordClearMask); + V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2), + DWordClearMask); + // Now pack things back together. 
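      // Editor's illustration (not part of the patch): with NumEvenDrops == 1
      // the AND above has cleared the upper 16 bits of every dword, so the
      // unsigned-saturating PACKUSDW below just keeps the low word of each
      // dword, realizing the 0,2,4,...,14 compaction in one instruction.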
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2); + if (NumEvenDrops == 2) { + Result = DAG.getBitcast(MVT::v4i32, Result); + Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result); + } + return Result; + } + // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) @@ -14281,72 +14831,6 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Mask, Subtarget, DAG); } -/// Check whether a compaction lowering can be done by dropping even -/// elements and compute how many times even elements must be dropped. -/// -/// This handles shuffles which take every Nth element where N is a power of -/// two. Example shuffle masks: -/// -/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 -/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 -/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 -/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 -/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 -/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 -/// -/// Any of these lanes can of course be undef. -/// -/// This routine only supports N <= 3. -/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here -/// for larger N. -/// -/// \returns N above, or the number of times even elements must be dropped if -/// there is such a number. Otherwise returns zero. -static int canLowerByDroppingEvenElements(ArrayRef<int> Mask, - bool IsSingleInput) { - // The modulus for the shuffle vector entries is based on whether this is - // a single input or not. - int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); - assert(isPowerOf2_32((uint32_t)ShuffleModulus) && - "We should only be called with masks with a power-of-2 size!"); - - uint64_t ModMask = (uint64_t)ShuffleModulus - 1; - - // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, - // and 2^3 simultaneously. This is because we may have ambiguity with - // partially undef inputs. - bool ViableForN[3] = {true, true, true}; - - for (int i = 0, e = Mask.size(); i < e; ++i) { - // Ignore undef lanes, we'll optimistically collapse them to the pattern we - // want. - if (Mask[i] < 0) - continue; - - bool IsAnyViable = false; - for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) - if (ViableForN[j]) { - uint64_t N = j + 1; - - // The shuffle mask must be equal to (i * 2^N) % M. - if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) - IsAnyViable = true; - else - ViableForN[j] = false; - } - // Early exit if we exhaust the possible powers of two. - if (!IsAnyViable) - break; - } - - for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) - if (ViableForN[j]) - return j + 1; - - // Return 0 as there is no viable power of two. - return 0; -} - static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { @@ -14410,6 +14894,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Mask, Subtarget, DAG)) return Broadcast; + // Try to use bit rotation instructions. 
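  // Editor's illustration (not part of the patch): on XOP targets a per-word
  // byte-swap mask such as <1,0,3,2,...,15,14> matches as a 16-bit rotate by
  // 8 bits and is emitted as a single VPROTW, avoiding a PSHUFB and its
  // constant-pool mask load.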
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask, + Subtarget, DAG)) + return Rotate; + if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; @@ -14524,6 +15013,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Zeroable, Subtarget, DAG)) return V; + // Check for compaction patterns. + bool IsSingleInput = V2.isUndef(); + int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput); + // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input @@ -14534,10 +15027,13 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // + // If the mask is a binary compaction, we can more efficiently perform this + // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()). + // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. - if (Subtarget.hasSSSE3()) { + if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) { bool V1InUse = false; bool V2InUse = false; @@ -14595,8 +15091,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. - bool IsSingleInput = V2.isUndef(); - if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { + if (NumEvenDrops) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. @@ -14604,23 +15099,23 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); - SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8)); - for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops) - ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8); - SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps); - V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); + SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16)); + for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1)) + WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16); + SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps); + V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1), + WordClearMask); if (!IsSingleInput) - V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); + V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2), + WordClearMask); // Now pack things back together. - V1 = DAG.getBitcast(MVT::v8i16, V1); - V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); - SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); + SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, + IsSingleInput ? 
V1 : V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } - return Result; } @@ -14725,37 +15220,13 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); - MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); + MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements); - // Rather than splitting build-vectors, just build two narrower build - // vectors. This helps shuffling with splats and zeros. + // Use splitVector/extractSubVector so that split build-vectors just build two + // narrower build vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { - V = peekThroughBitcasts(V); - - MVT OrigVT = V.getSimpleValueType(); - int OrigNumElements = OrigVT.getVectorNumElements(); - int OrigSplitNumElements = OrigNumElements / 2; - MVT OrigScalarVT = OrigVT.getVectorElementType(); - MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); - SDValue LoV, HiV; - - auto *BV = dyn_cast<BuildVectorSDNode>(V); - if (!BV) { - LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, - DAG.getIntPtrConstant(0, DL)); - HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, - DAG.getIntPtrConstant(OrigSplitNumElements, DL)); - } else { - - SmallVector<SDValue, 16> LoOps, HiOps; - for (int i = 0; i < OrigSplitNumElements; ++i) { - LoOps.push_back(BV->getOperand(i)); - HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); - } - LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); - HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); - } + std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL); return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; @@ -15963,7 +16434,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, SmallVector<int, 2> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector<int, 4> PSHUFDMask; - scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask); + narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, @@ -15984,7 +16455,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -16085,13 +16556,14 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { - SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); - if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) + if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) { + SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); - - if (Subtarget.hasAVX2()) + } + if (Subtarget.hasAVX2()) { + SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); - + } // Otherwise, fall back. 
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); @@ -16190,7 +16662,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -16210,9 +16682,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; - // If the shuffle patterns aren't repeated but it is a single input, directly - // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { + // Try to produce a fixed cross-128-bit lane permute followed by unpack + // because that should be faster than the variable permute alternatives. + if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; + + // If the shuffle patterns aren't repeated but it's a single input, directly + // generate a cross-lane VPERMD instruction. SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } @@ -16294,6 +16771,16 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return V; if (V2.isUndef()) { + // Try to use bit rotation instructions. + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG)) + return Rotate; + + // Try to produce a fixed cross-128-bit lane permute followed by unpack + // because that should be faster than the variable permute alternatives. + if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; + // There are no generalized cross-lane shuffle operations available on i16 // element types. if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { @@ -16379,7 +16866,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. @@ -16387,6 +16874,12 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget, DAG)) return Rotate; + // Try to use bit rotation instructions. + if (V2.isUndef()) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG)) + return Rotate; + // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( @@ -16396,6 +16889,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { + // Try to produce a fixed cross-128-bit lane permute followed by unpack + // because that should be faster than the variable permute alternatives. 
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; + if (SDValue V = lowerShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; @@ -16518,13 +17016,14 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); // TODO - use Zeroable like we do for lowerV2X128VectorShuffle? - SmallVector<int, 4> WidenedMask; - if (!canWidenShuffleElements(Mask, WidenedMask)) + SmallVector<int, 4> Widened128Mask; + if (!canWidenShuffleElements(Mask, Widened128Mask)) return SDValue(); + assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch"); // Try to use an insert into a zero vector. - if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && - (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { + if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && + (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, @@ -16536,37 +17035,34 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, // Check for patterns which can be matched with a single insert of a 256-bit // subvector. - bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, - {0, 1, 2, 3, 0, 1, 2, 3}); - if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, - {0, 1, 2, 3, 8, 9, 10, 11})) { + bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); + if (OnlyUsesV1 || + isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); - SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, - OnlyUsesV1 ? V1 : V2, - DAG.getIntPtrConstant(0, DL)); + SDValue SubVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, + DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(4, DL)); } - assert(WidenedMask.size() == 4); - // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { - assert(WidenedMask[i] >= -1); - if (WidenedMask[i] < 0) + assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Widened128Mask[i] < 0) continue; // Make sure all V1 subvectors are in place. - if (WidenedMask[i] < 4) { - if (WidenedMask[i] != i) { + if (Widened128Mask[i] < 4) { + if (Widened128Mask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and its the lowest 128-bits. - if (V2Index >= 0 || WidenedMask[i] != 4) { + if (V2Index >= 0 || Widened128Mask[i] != 4) { IsInsert = false; break; } @@ -16580,16 +17076,26 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } + // See if we can widen to a 256-bit lane shuffle, we're going to lose 128-lane + // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where + // possible we at least ensure the lanes stay sequential to help later + // combines. + SmallVector<int, 2> Widened256Mask; + if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) { + Widened128Mask.clear(); + narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask); + } + // Try to lower to vshuf64x2/vshuf32x4. 
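  // Worked example (editor's illustration, not part of the patch): a
  // 128-bit-lane mask of <0,1,4,5> (low half of V1, then low half of V2)
  // leaves Ops[0] = V1 and Ops[1] = V2, and the loop below encodes it as
  // PermMask = 0b01000100 (0x44).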
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Insure elements came from the same Op. for (int i = 0; i < 4; ++i) { - assert(WidenedMask[i] >= -1); - if (WidenedMask[i] < 0) + assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Widened128Mask[i] < 0) continue; - SDValue Op = WidenedMask[i] >= 4 ? V2 : V1; + SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; @@ -16598,7 +17104,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. - PermMask |= (WidenedMask[i] % 4) << (i * 2); + PermMask |= (Widened128Mask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], @@ -16696,6 +17202,12 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + // Try to create an in-lane repeating shuffle mask and then shuffle the + // results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. if (V2.isUndef() && @@ -16728,7 +17240,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, SmallVector<int, 2> Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector<int, 4> PSHUFDMask; - scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask); + narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, @@ -16752,7 +17264,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -16814,7 +17326,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Shift; // Try to use VALIGN. - if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, + if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; @@ -16833,6 +17345,13 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } + + // Try to create an in-lane repeating shuffle mask and then shuffle the + // results into the target lanes. + if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( + DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) + return V; + // If we have AVX512F support, we can use VEXPAND. 
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) @@ -16841,6 +17360,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } @@ -16865,6 +17385,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = + lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -16876,18 +17401,23 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, return Rotate; if (V2.isUndef()) { + // Try to use bit rotation instructions. + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG)) + return Rotate; + SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. - return lowerV8I16GeneralSingleInputShuffle( - DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1, + RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, @@ -16933,6 +17463,17 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget, DAG)) return Rotate; + // Try to use bit rotation instructions. + if (V2.isUndef()) + if (SDValue Rotate = + lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG)) + return Rotate; + + // Lower as AND if possible. + if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return Masked; + if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; @@ -16995,6 +17536,18 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, Subtarget, DAG)) return Broadcast; + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) { + // Try using bit ops for masking and blending before falling back to + // splitting. + if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return V; + if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) + return V; + + return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); + } + // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that @@ -17477,6 +18030,10 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); + // Expand v32i16/v64i8 without BWI. 
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return SDValue(); + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. @@ -17532,14 +18089,24 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); + SDValue Vec = Op.getOperand(0); + SDValue Idx = Op.getOperand(1); + assert(isa<ConstantSDNode>(Idx) && "Constant index expected"); SDLoc dl(Op); - if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) + if (!Vec.getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { - SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless + // we're going to zero extend the register or fold the store. + if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) && + !MayFoldIntoStore(Op)) + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + DAG.getBitcast(MVT::v4i32, Vec), Idx)); + + SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } @@ -17552,22 +18119,17 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); - if ((User->getOpcode() != ISD::STORE || - isNullConstant(Op.getOperand(1))) && + if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, - DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), - Op.getOperand(1)); + DAG.getBitcast(MVT::v4i32, Vec), Idx); return DAG.getBitcast(MVT::f32, Extract); } - if (VT == MVT::i32 || VT == MVT::i64) { - // ExtractPS/pextrq works with constant index. - if (isa<ConstantSDNode>(Op.getOperand(1))) + if (VT == MVT::i32 || VT == MVT::i64) return Op; - } return SDValue(); } @@ -17580,6 +18142,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); + auto* IdxC = dyn_cast<ConstantSDNode>(Idx); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && @@ -17587,7 +18150,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, // variable index can't be handled in mask registers, // extend vector to VR512/128 - if (!isa<ConstantSDNode>(Idx)) { + if (!IdxC) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. 
@@ -17598,7 +18161,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned IdxVal = IdxC->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; @@ -17627,11 +18190,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); + auto* IdxC = dyn_cast<ConstantSDNode>(Idx); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); - if (!isa<ConstantSDNode>(Idx)) { + if (!IdxC) { // Its more profitable to go through memory (1 cycles throughput) // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) // IACA tool was used to get performance estimation @@ -17665,7 +18229,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, return SDValue(); } - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned IdxVal = IdxC->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. @@ -17697,9 +18261,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); - // Transform it so it match pextrw which produces a 32-bit result. - SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } @@ -17789,9 +18351,7 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, // Copy into a k-register, extract to v1i1 and insert_subvector. SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt); - - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, - Op.getOperand(2)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, @@ -17864,11 +18424,22 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // This will be just movd/movq/movss/movsd. - if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) && - (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || - EltVT == MVT::i64)) { - N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); - return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) { + if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || + EltVT == MVT::i64) { + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); + return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + } + + // We can't directly insert an i8 or i16 into a vector, so zero extend + // it to i32 first. 
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) { + N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1); + MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); + N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1); + N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); + return DAG.getBitcast(VT, N1); + } } // Transform it so it match pinsr{b,w} which expects a GR32 as its second @@ -17981,12 +18552,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); SDValue Vec = Op.getOperand(0); - SDValue Idx = Op.getOperand(1); - - if (!isa<ConstantSDNode>(Idx)) - return SDValue(); + uint64_t IdxVal = Op.getConstantOperandVal(1); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; @@ -18045,7 +18612,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); + CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. @@ -18554,25 +19121,47 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, Op0, Op1, Amt); } - - assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && - "Unexpected funnel shift type!"); + assert( + (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && + "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. bool OptForSize = DAG.shouldOptForSize(); - if (!OptForSize && Subtarget.isSHLDSlow()) - return SDValue(); + bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow(); - if (IsFSHR) - std::swap(Op0, Op1); + // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. + // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). + if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) && + !isa<ConstantSDNode>(Amt)) { + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType()); + SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType()); + Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32); + Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32); + Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask); + SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1); + if (IsFSHR) { + Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt); + } else { + Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt); + Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift); + } + return DAG.getZExtOrTrunc(Res, DL, VT); + } + + if (VT == MVT::i8 || ExpandFunnel) + return SDValue(); // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. - if (VT == MVT::i16) + if (VT == MVT::i16) { Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, DAG.getConstant(15, DL, Amt.getValueType())); + unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL); + return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt); + } - unsigned SHDOp = (IsFSHR ? 
X86ISD::SHRD : X86ISD::SHLD); - return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); + return Op; } // Try to use a packed vector operation to handle i64 on 32-bit targets when @@ -18682,6 +19271,56 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG, DAG.getIntPtrConstant(0, DL)); } +/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), +/// try to vectorize the cast ops. This will avoid an expensive round-trip +/// between XMM and GPR. +static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: Allow FP_TO_UINT. + SDValue CastToInt = CastToFP.getOperand(0); + MVT VT = CastToFP.getSimpleValueType(); + if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector()) + return SDValue(); + + MVT IntVT = CastToInt.getSimpleValueType(); + SDValue X = CastToInt.getOperand(0); + MVT SrcVT = X.getSimpleValueType(); + if (SrcVT != MVT::f32 && SrcVT != MVT::f64) + return SDValue(); + + // See if we have 128-bit vector cast instructions for this type of cast. + // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd. + if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) || + IntVT != MVT::i32) + return SDValue(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned IntSize = IntVT.getSizeInBits(); + unsigned VTSize = VT.getSizeInBits(); + MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize); + MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize); + MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize); + + // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64. + unsigned ToIntOpcode = + SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT; + unsigned ToFPOpcode = + IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP; + + // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0 + // + // We are not defining the high elements (for example, zero them) because + // that could nullify any performance advantage that we hoped to gain from + // this vector op hack. We do not expect any adverse effects (like denorm + // penalties) with cast ops. 
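  // Concrete illustration (editor's note, not part of the patch): for
  //   float f = (float)(int)x;   // x is an f32 value
  // this replaces the scalar CVTTSS2SI + CVTSI2SS round-trip through a GPR
  // with CVTTPS2DQ + CVTDQ2PS acting on element 0 of an XMM register.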
+ SDLoc DL(CastToFP); + SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); + SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X); + SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX); + SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx); +} + static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); @@ -18739,15 +19378,15 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG, SmallVector<SDValue, 4> SignCvts(4); SmallVector<SDValue, 4> Chains(4); for (int i = 0; i != 4; ++i) { - SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc, DAG.getIntPtrConstant(i, DL)); if (IsStrict) { SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other}, - {Op.getOperand(0), Src}); + {Op.getOperand(0), Elt}); Chains[i] = SignCvts[i].getValue(1); } else { - SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src); + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt); } } SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts); @@ -18784,6 +19423,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; + if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget)) + return R; + if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { // Note: Since v2f64 is a legal type. We don't need to zero extend the @@ -18832,21 +19474,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); SDValue ValueToStore = Src; - if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit()) + if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. 
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); - unsigned Size = SrcVT.getSizeInBits()/8; + unsigned Size = SrcVT.getStoreSize(); + Align Alignment(Size); MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); - int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); + int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - Chain = DAG.getStore( - Chain, dl, ValueToStore, StackSlot, - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); - std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); + Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment); + std::pair<SDValue, SDValue> Tmp = + BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -18854,58 +19498,40 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, return Tmp.first; } -std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, - SDValue StackSlot, - SelectionDAG &DAG) const { +std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD( + EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, + MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const { // Build the FILD - SDLoc DL(Op); SDVTList Tys; - bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); + bool useSSE = isScalarFPTypeInSSEReg(DstVT); if (useSSE) - Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); + Tys = DAG.getVTList(MVT::f80, MVT::Other); else - Tys = DAG.getVTList(Op.getValueType(), MVT::Other); + Tys = DAG.getVTList(DstVT, MVT::Other); - unsigned ByteSize = SrcVT.getSizeInBits() / 8; - - FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot); - MachineMemOperand *LoadMMO; - if (FI) { - int SSFI = FI->getIndex(); - LoadMMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); - } else { - LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); - StackSlot = StackSlot.getOperand(1); - } - SDValue FILDOps[] = {Chain, StackSlot}; + SDValue FILDOps[] = {Chain, Pointer}; SDValue Result = - DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, - Tys, FILDOps, SrcVT, LoadMMO); + DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo, + Alignment, MachineMemOperand::MOLoad); Chain = Result.getValue(1); if (useSSE) { - SDValue InFlag = Result.getValue(2); - - // FIXME: Currently the FST is glued to the FILD_FLAG. This - // shouldn't be necessary except that RFP cannot be live across - // multiple blocks. When stackifier is fixed, they can be uncoupled. 
MachineFunction &MF = DAG.getMachineFunction(); - unsigned SSFISize = Op.getValueSizeInBits() / 8; - int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); + unsigned SSFISize = DstVT.getStoreSize(); + int SSFI = + MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); - SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag}; + SDValue FSTOps[] = {Chain, Result, StackSlot}; MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand::MOStore, SSFISize, Align(SSFISize)); - Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, - Op.getValueType(), StoreMMO); + Chain = + DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO); Result = DAG.getLoad( - Op.getValueType(), DL, Chain, StackSlot, + DstVT, DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); Chain = Result.getValue(1); } @@ -18948,7 +19574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); + SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16)); SmallVector<Constant*,2> CV1; CV1.push_back( @@ -18958,7 +19584,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); - SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); + SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16)); // Load the 64-bit value into an XMM register. SDValue XR1 = @@ -19163,13 +19789,13 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, *DAG.getContext(), APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL))); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); - SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8); + SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8)); SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other); SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; SDValue VBias = DAG.getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*Alignment*/ 8, MachineMemOperand::MOLoad); + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8), + MachineMemOperand::MOLoad); SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, DAG.getBitcast(MVT::v4i64, VBias)); @@ -19337,15 +19963,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. 
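The constants 0x43300000 and 0x45300000 in LowerUINT_TO_FP_i64 above are the high exponent words of 2^52 and 2^84; a scalar model of the same uint64-to-double trick (an illustrative sketch, assuming IEEE-754 doubles, little-endian layout, and C++17 hexadecimal float literals):

#include <cstdint>
#include <cstring>

double u64_to_f64_bias(uint64_t x) {
  // Splice the low/high 32-bit halves into the mantissas of 2^52 and 2^84.
  uint64_t lo = (x & 0xFFFFFFFFULL) | 0x4330000000000000ULL; // 2^52 + lo
  uint64_t hi = (x >> 32)           | 0x4530000000000000ULL; // 2^84 + hi * 2^32
  double dlo, dhi;
  std::memcpy(&dlo, &lo, 8);
  std::memcpy(&dhi, &hi, 8);
  // Both subtractions are exact; the final addition rounds once, so the result
  // is the correctly rounded conversion of x.
  return (dhi - 0x1p84) + (dlo - 0x1p52);
}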
- SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); + SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8); + int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = - DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo()); + DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), - OffsetSlot, MachinePointerInfo()); + OffsetSlot, MPI.getWithOffset(4), 4); std::pair<SDValue, SDValue> Tmp = - BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); + BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG); if (IsStrict) return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); @@ -19361,21 +19990,17 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); } SDValue Store = - DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo()); + DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8)); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) - int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), - MachineMemOperand::MOLoad, 8, 8); - SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot }; - SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, - MVT::i64, MMO); + SDValue Fild = + DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI, + Align(8), MachineMemOperand::MOLoad); Chain = Fild.getValue(1); @@ -19388,6 +20013,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, APInt FF(64, 0x5F80000000000000ULL); SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF), PtrVT); + Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign(); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); @@ -19399,7 +20025,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, - /* Alignment = */ 4); + CPAlignment); Chain = Fudge.getValue(1); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? @@ -19462,7 +20088,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getStoreSize(); - int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); + int SSFI = + MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Chain = IsStrict ? 
Op.getOperand(0) : DAG.getEntryNode(); @@ -19537,20 +20164,20 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); - SDVTList Tys = DAG.getVTList(TheVT, MVT::Other); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Chain, StackSlot }; unsigned FLDSize = TheVT.getStoreSize(); assert(FLDSize <= MemSize && "Stack slot not big enough"); MachineMemOperand *MMO = MF.getMachineMemOperand( - MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize); + MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize)); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); Chain = Value.getValue(1); } // Build the FP_TO_INT*_IN_MEM MachineMemOperand *MMO = MF.getMachineMemOperand( - MPI, MachineMemOperand::MOStore, MemSize, MemSize); + MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize)); SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, DAG.getVTList(MVT::Other), @@ -19590,14 +20217,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc); - // Custom legalize v8i8->v8i64 on CPUs without avx512bw. - if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) - return SDValue(); - - In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), - MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - return DAG.getNode(ExtendInVecOpc, dl, VT, In); + if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { + assert(InVT == MVT::v32i8 && "Unexpected VT!"); + return splitVectorIntUnary(Op, DAG); } if (Subtarget.hasInt256()) @@ -19729,7 +20351,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, "Unexpected PACK opcode"); assert(DstVT.isVector() && "VT not a vector?"); - // Requires SSE2 but AVX512 has fast vector truncate. + // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below). if (!Subtarget.hasSSE2()) return SDValue(); @@ -19770,15 +20392,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits()); In = DAG.getBitcast(InVT, In); - SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In); + SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT)); Res = extractSubVector(Res, 0, DAG, DL, 64); return DAG.getBitcast(DstVT, Res); } - // Extract lower/upper subvectors. - unsigned NumSubElts = NumElems / 2; - SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); - SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); + // Split lower/upper subvectors. + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(In, DAG, DL); unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); @@ -19804,7 +20425,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. 
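For reference, the PACK-based narrowing that truncateVectorWithPACK above builds on, in intrinsic form (an illustrative sketch, SSE2 only): PACKSSDW saturates, so it only behaves as a plain i32-to-i16 truncation when every lane is already a sign-extended 16-bit value, which is what the sign-bit analysis in this lowering establishes before choosing the PACKSS path.

#include <emmintrin.h>

// Narrow four sign-extended 16-bit values held in 32-bit lanes down to i16.
// Both result halves carry the packed lanes; the lowering keeps the low 64 bits.
__m128i trunc_v4i32_to_v4i16(__m128i v) {
  return _mm_packs_epi32(v, v);
}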
SmallVector<int, 64> Mask; int Scale = 64 / OutVT.getScalarSizeInBits(); - scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask); + narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask); Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); if (DstVT.is256BitVector()) @@ -19818,7 +20439,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, // Recursively pack lower/upper subvectors, concat result and pack again. assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"); - EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); + EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); @@ -19865,17 +20486,22 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors // we need to split into two 8 element vectors which we can extend to v8i32, // truncate and concat the results. There's an additional complication if - // the original type is v16i8. In that case we can't split the v16i8 so - // first we pre-extend it to v16i16 which we can split to v8i16, then extend - // to v8i32, truncate that to v8i1 and concat the two halves. + // the original type is v16i8. In that case we can't split the v16i8 + // directly, so we need to shuffle high elements to low and use + // sign_extend_vector_inreg. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) { + SDValue Lo, Hi; if (InVT == MVT::v16i8) { - // First we need to sign extend up to 256-bits so we can split that. - InVT = MVT::v16i16; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In); + Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In); + Hi = DAG.getVectorShuffle( + InVT, DL, In, In, + {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}); + Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi); + } else { + assert(InVT == MVT::v16i16 && "Unexpected VT!"); + Lo = extract128BitVector(In, 0, DAG, DL); + Hi = extract128BitVector(In, 8, DAG, DL); } - SDValue Lo = extract128BitVector(In, 0, DAG, DL); - SDValue Hi = extract128BitVector(In, 8, DAG, DL); // We're split now, just emit two truncates and a concat. The two // truncates will trigger legalization to come back to this function. Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo); @@ -19918,7 +20544,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { if (!TLI.isTypeLegal(InVT)) { if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && VT.is128BitVector()) { - assert(Subtarget.hasVLX() && "Unexpected subtarget!"); + assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) && + "Unexpected subtarget!"); // The default behavior is to truncate one step, concatenate, and then // truncate the remainder. We'd rather produce two 64-bit results and // concatenate those. @@ -19942,6 +20569,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { + if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) { + assert(VT == MVT::v32i8 && "Unexpected VT!"); + return splitVectorIntUnary(Op, DAG); + } + // word to byte only under BWI. Otherwise we have to promoted to v16i32 // and then truncate that. But we should only do that if we haven't been // asked to avoid 512-bit vectors. 
The actual promotion to v16i32 will be @@ -20174,6 +20806,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { + if (!Subtarget.hasVLX()) { + // Non-strict nodes without VLX can we widened to v4f32->v4i64 by type + // legalizer and then widened again by vector op legalization. + if (!IsStrict) + return SDValue(); + + SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32); + SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32, + {Src, Zero, Zero, Zero}); + Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, + {Op->getOperand(0), Tmp}); + SDValue Chain = Tmp.getValue(1); + Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Tmp, Chain}, dl); + return Tmp; + } + assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL"); SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); @@ -20281,6 +20932,62 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); } +SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op, + SelectionDAG &DAG) const { + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // If the source is in an SSE register, the node is Legal. + if (isScalarFPTypeInSSEReg(SrcVT)) + return Op; + + return LRINT_LLRINTHelper(Op.getNode(), DAG); +} + +SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, + SelectionDAG &DAG) const { + EVT DstVT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return SDValue(); + } + + SDLoc DL(N); + SDValue Chain = DAG.getEntryNode(); + + bool UseSSE = isScalarFPTypeInSSEReg(SrcVT); + + // If we're converting from SSE, the stack slot needs to hold both types. + // Otherwise it only needs to hold the DstVT. + EVT OtherVT = UseSSE ? SrcVT : DstVT; + SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + + if (UseSSE) { + assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!"); + Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue Ops[] = { Chain, StackPtr }; + + Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI, + /*Align*/ None, MachineMemOperand::MOLoad); + Chain = Src.getValue(1); + } + + SDValue StoreOps[] = { Chain, Src, StackPtr }; + Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other), + StoreOps, DstVT, MPI, /*Align*/ None, + MachineMemOperand::MOStore); + + return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI); +} + SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); @@ -20333,6 +21040,67 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { return Tmp.first; } +static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); + assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 && + "Unexpected VT!"); + + SDLoc dl(Op); + SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, + DAG.getConstant(0, dl, MVT::v8i16), Src, + DAG.getIntPtrConstant(0, dl)); + + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other}, + {Op.getOperand(0), Res}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; +} + +static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) { + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 && + "Unexpected VT!"); + + SDLoc dl(Op); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32, + DAG.getConstantFP(0, dl, MVT::v4f32), Src, + DAG.getIntPtrConstant(0, dl)); + Res = DAG.getNode( + X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, + {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src); + Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, + DAG.getTargetConstant(4, dl, MVT::i32)); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, + DAG.getIntPtrConstant(0, dl)); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + + return Res; +} + /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, @@ -20413,6 +21181,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); } +/// ISD::FROUND is defined to round to nearest with ties rounding away from 0. +/// This mode isn't supported in hardware on X86. But as long as we aren't +/// compiling with trapping math, we can emulate this with +/// floor(X + copysign(nextafter(0.5, 0.0), X)). +static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { + SDValue N0 = Op.getOperand(0); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // N0 += copysign(nextafter(0.5, 0.0), N0) + const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); + bool Ignored; + APFloat Point5Pred = APFloat(0.5f); + Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored); + Point5Pred.next(/*nextDown*/true); + + SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT, + DAG.getConstantFP(Point5Pred, dl, VT), N0); + N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder); + + // Truncate the result to remove fraction. + return DAG.getNode(ISD::FTRUNC, dl, VT, N0); +} + /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { @@ -20568,9 +21360,12 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, } /// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...)) -/// style scalarized (associative) reduction patterns. +/// style scalarized (associative) reduction patterns. 
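A scalar model of the FROUND emulation introduced above (an illustrative sketch; it mirrors trunc(X + copysign(nextafter(0.5, 0.0), X)) and, as the patch comment notes, assumes non-trapping math):

#include <cmath>

double round_half_away(double x) {
  // nextafter(0.5, 0.0) is the largest double below 0.5, so exact halfway cases
  // still reach the next integer once the addition rounds, while values just
  // under .5 in magnitude do not.
  double adder = std::copysign(std::nextafter(0.5, 0.0), x);
  return std::trunc(x + adder);
}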
Partial reductions +/// are supported when the pointer SrcMask is non-null. +/// TODO - move this to SelectionDAG? static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, - SmallVectorImpl<SDValue> &SrcOps) { + SmallVectorImpl<SDValue> &SrcOps, + SmallVectorImpl<APInt> *SrcMask = nullptr) { SmallVector<SDValue, 8> Opnds; DenseMap<SDValue, APInt> SrcOpMap; EVT VT = MVT::Other; @@ -20598,8 +21393,8 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, return false; // Quit if without a constant index. - SDValue Idx = I->getOperand(1); - if (!isa<ConstantSDNode>(Idx)) + auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1)); + if (!Idx) return false; SDValue Src = I->getOperand(0); @@ -20615,61 +21410,167 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; SrcOps.push_back(Src); } + // Quit if element already used. - unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned CIdx = Idx->getZExtValue(); if (M->second[CIdx]) return false; M->second.setBit(CIdx); } - // Quit if not all elements are used. - for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(), - E = SrcOpMap.end(); - I != E; ++I) { - if (!I->second.isAllOnesValue()) - return false; + if (SrcMask) { + // Collect the source partial masks. + for (SDValue &SrcOp : SrcOps) + SrcMask->push_back(SrcOpMap[SrcOp]); + } else { + // Quit if not all elements are used. + for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(), + E = SrcOpMap.end(); + I != E; ++I) { + if (!I->second.isAllOnesValue()) + return false; + } } return true; } -// Check whether an OR'd tree is PTEST-able. -static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, +// Helper function for comparing all bits of a vector against zero. +static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC, + const APInt &Mask, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, X86::CondCode &X86CC) { + EVT VT = V.getValueType(); + assert(Mask.getBitWidth() == VT.getScalarSizeInBits() && + "Element Mask vs Vector bitwidth mismatch"); + + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); + X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE); + + auto MaskBits = [&](SDValue Src) { + if (Mask.isAllOnesValue()) + return Src; + EVT SrcVT = Src.getValueType(); + SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT); + return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue); + }; + + // For sub-128-bit vector, cast to (legal) integer and compare with zero. + if (VT.getSizeInBits() < 128) { + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) + return SDValue(); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, + DAG.getBitcast(IntVT, MaskBits(V)), + DAG.getConstant(0, DL, IntVT)); + } + + // Quit if not splittable to 128/256-bit vector. + if (!isPowerOf2_32(VT.getSizeInBits())) + return SDValue(); + + // Split down to 128/256-bit vector. + unsigned TestSize = Subtarget.hasAVX() ? 256 : 128; + while (VT.getSizeInBits() > TestSize) { + auto Split = DAG.SplitVector(V, DL); + VT = Split.first.getValueType(); + V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); + } + + bool UsePTEST = Subtarget.hasSSE41(); + if (UsePTEST) { + MVT TestVT = VT.is128BitVector() ? 
MVT::v2i64 : MVT::v4i64; + V = DAG.getBitcast(TestVT, MaskBits(V)); + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V); + } + + // Without PTEST, a masked v2i64 or-reduction is not faster than + // scalarization. + if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32) + return SDValue(); + + V = DAG.getBitcast(MVT::v16i8, MaskBits(V)); + V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V, + getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, + DAG.getConstant(0xFFFF, DL, MVT::i32)); +} + +// Check whether an OR'd reduction tree is PTEST-able, or if we can fallback to +// CMP(MOVMSK(PCMPEQB(X,0))). +static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC, + const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &X86CC) { - assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); - if (!Subtarget.hasSSE41() || !Op->hasOneUse()) + if (!Subtarget.hasSSE2() || !Op->hasOneUse()) return SDValue(); - SmallVector<SDValue, 8> VecIns; - if (!matchScalarReduction(Op, ISD::OR, VecIns)) - return SDValue(); + // Check whether we're masking/truncating an OR-reduction result, in which + // case track the masked bits. + APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits()); + switch (Op.getOpcode()) { + case ISD::TRUNCATE: { + SDValue Src = Op.getOperand(0); + Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(), + Op.getScalarValueSizeInBits()); + Op = Src; + break; + } + case ISD::AND: { + if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { + Mask = Cst->getAPIntValue(); + Op = Op.getOperand(0); + } + break; + } + } - // Quit if not 128/256-bit vector. - EVT VT = VecIns[0].getValueType(); - if (!VT.is128BitVector() && !VT.is256BitVector()) - return SDValue(); + SmallVector<SDValue, 8> VecIns; + if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) { + EVT VT = VecIns[0].getValueType(); + assert(llvm::all_of(VecIns, + [VT](SDValue V) { return VT == V.getValueType(); }) && + "Reduction source vector mismatch"); + + // Quit if less than 128-bits or not splittable to 128/256-bit vector. + if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits())) + return SDValue(); - SDLoc DL(Op); - MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + // If more than one full vector is evaluated, OR them first before PTEST. + for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; + Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is + // only 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS)); + } - // Cast all vectors into TestVT for PTEST. - for (unsigned i = 0, e = VecIns.size(); i < e; ++i) - VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); + X86::CondCode CCode; + if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget, + DAG, CCode)) { + X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8); + return V; + } + } - // If more than one full vector is evaluated, OR them first before PTEST. - for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { - // Each iteration will OR 2 nodes and append the result until there is only - // 1 node left, i.e. the final OR'd value of all vectors. 
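The two endpoints LowerVectorAllZero above can emit, shown in intrinsic form (an illustrative sketch): PTEST when SSE4.1 is available, otherwise a byte compare against zero followed by MOVMSK.

#include <emmintrin.h>
#include <smmintrin.h>

bool all_zero_ptest(__m128i v) {
  return _mm_testz_si128(v, v) != 0;          // PTEST: ZF is set when (v & v) == 0
}

bool all_zero_movmsk(__m128i v) {
  __m128i eq = _mm_cmpeq_epi8(v, _mm_setzero_si128()); // 0xFF for each zero byte
  return _mm_movemask_epi8(eq) == 0xFFFF;              // all 16 bytes were zero
}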
- SDValue LHS = VecIns[Slot]; - SDValue RHS = VecIns[Slot + 1]; - VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + ISD::NodeType BinOp; + if (SDValue Match = + DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) { + X86::CondCode CCode; + if (SDValue V = + LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) { + X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8); + return V; + } + } } - X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, - DL, MVT::i8); - return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); + return SDValue(); } /// return true if \c Op has a use that doesn't just read flags. @@ -20814,27 +21715,14 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. -static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1, - unsigned X86CC, const SDLoc &dl, - SelectionDAG &DAG, - const X86Subtarget &Subtarget, - SDValue Chain, bool IsSignaling) { +static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, + const SDLoc &dl, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (isNullConstant(Op1)) - return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain); + return EmitTest(Op0, X86CC, dl, DAG, Subtarget); EVT CmpVT = Op0.getValueType(); - if (CmpVT.isFloatingPoint()) { - if (Chain) { - SDValue Res = - DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, - dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); - return std::make_pair(Res, Res.getValue(1)); - } - return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1), - SDValue()); - } - assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); @@ -20884,40 +21772,28 @@ static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1, Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); } + // 0-x == y --> x+y == 0 + // 0-x != y --> x+y != 0 + if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && + Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1); + return Add.getValue(1); + } + + // x == 0-y --> x+y == 0 + // x != 0-y --> x+y != 0 + if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) && + Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); + SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1)); + return Add.getValue(1); + } + // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); - return std::make_pair(Sub.getValue(1), SDValue()); -} - -/// Convert a comparison if required by the subtarget. -SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, - SelectionDAG &DAG) const { - // If the subtarget does not support the FUCOMI instruction, floating-point - // comparisons have to be converted. - bool IsCmp = Cmp.getOpcode() == X86ISD::CMP; - bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP || - Cmp.getOpcode() == X86ISD::STRICT_FCMPS; - - if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) || - !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() || - !Cmp.getOperand(IsStrictCmp ? 
2 : 1).getValueType().isFloatingPoint()) - return Cmp; - - // The instruction selector will select an FUCOM instruction instead of - // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence - // build an SDNode sequence that transfers the result from FPSW into EFLAGS: - // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8)))) - SDLoc dl(Cmp); - SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); - SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); - SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, - DAG.getConstant(8, dl, MVT::i8)); - SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); - - // Some 64-bit targets lack SAHF support, but they do support FCOMI. - assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); - return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); + return Sub.getValue(1); } /// Check if replacement of SQRT with RSQRT should be disabled. @@ -21211,32 +22087,30 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. -static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); +static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); - assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && - "Unsupported value type for operation"); + assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation"); + assert(Op.getOperand(0).getValueType().isInteger() && + VT == Op.getOperand(0).getValueType() && "Unsupported VTs!"); - unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); + // Extract the LHS Lo/Hi vectors + SDValue LHS1, LHS2; + std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl); - // Extract the RHS vectors - SDValue RHS = Op.getOperand(1); - SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); + // Extract the RHS Lo/Hi vectors + SDValue RHS1, RHS2; + std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl); // Issue the operation on the smaller types and concatenate the result back - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); + DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC), + DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC)); } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { @@ -21369,8 +22243,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); + // If we have a strict compare with a vXi1 result and the input is 128/256 + // bits we can't use a masked compare unless we have VLX. If we use a wider + // compare like we do for non-strict, we might trigger spurious exceptions + // from the upper elements. Instead emit a AVX compare and convert to mask. 
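The 0-x == y rewrite added to EmitCmp above rests on a modular-arithmetic identity, shown here in scalar form (an illustrative sketch): with n-bit wraparound, (0 - x) == y holds exactly when (x + y) == 0, so the compare that follows a negate can instead test the zero flag of an ADD.

#include <cstdint>

bool eq_via_neg(uint32_t x, uint32_t y) { return (0u - x) == y; }
bool eq_via_add(uint32_t x, uint32_t y) { return (x + y) == 0u; }
// The two functions agree for every pair (x, y).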
unsigned Opc; - if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) { + if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 && + (!IsStrict || Subtarget.hasVLX() || + Op0.getSimpleValueType().is512BitVector())) { assert(VT.getVectorNumElements() <= 16); Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { @@ -21466,10 +22346,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); } - // If this is SSE/AVX CMPP, bitcast the result back to integer to match the - // result type of SETCC. The bitcast is expected to be optimized away - // during combining/isel. - Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) { + // We emitted a compare with an XMM/YMM result. Finish converting to a + // mask register using a vptestm. + EVT CastVT = EVT(VT).changeVectorElementTypeToInteger(); + Cmp = DAG.getBitcast(CastVT, Cmp); + Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp, + DAG.getConstant(0, dl, CastVT), ISD::SETNE); + } else { + // If this is SSE/AVX CMPP, bitcast the result back to integer to match + // the result type of SETCC. The bitcast is expected to be optimized + // away during combining/isel. + Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); + } if (IsStrict) return DAG.getMergeValues({Cmp, Chain}, dl); @@ -21563,7 +22452,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntVSETCC(Op, DAG); + return splitIntVSETCC(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) { + assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!"); + return splitIntVSETCC(Op, DAG); + } // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. @@ -21812,9 +22706,8 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, /// corresponding X86 condition code constant in X86CC. SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, - SelectionDAG &DAG, SDValue &X86CC, - SDValue &Chain, - bool IsSignaling) const { + SelectionDAG &DAG, + SDValue &X86CC) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). @@ -21825,13 +22718,12 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, return BT; } - // Try to use PTEST for a tree ORs equality compared with 0. + // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0. // TODO: We could do AND tree with all 1s as well by using the C flag. - if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC)) - return PTEST; - } + if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) + if (SDValue CmpZ = + MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC)) + return CmpZ; // Try to lower using KORTEST or KTEST. 
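The two bit-test shapes that emitFlagsForSetcc above tries to turn into BT, written as plain C++ (an illustrative sketch): both read a single bit, and BT places that bit in the carry flag for the subsequent setcc or branch.

#include <cstdint>

bool bit_is_clear(uint32_t x, unsigned n) { return (x & (1u << n)) == 0; }
bool bit_is_set(uint32_t x, unsigned n)   { return ((x >> n) & 1u) != 0; }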
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) @@ -21873,17 +22765,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, } } - bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); - X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); - if (CondCode == X86::COND_INVALID) - return SDValue(); + X86::CondCode CondCode = + TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG); + assert(CondCode != X86::COND_INVALID && "Unexpected condition code!"); - std::pair<SDValue, SDValue> Tmp = - EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling); - SDValue EFLAGS = Tmp.first; - if (Chain) - Chain = Tmp.second; - EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); + SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget); X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); return EFLAGS; } @@ -21920,18 +22806,32 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } - SDValue X86CC; - SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain, - Op.getOpcode() == ISD::STRICT_FSETCCS); - if (!EFLAGS) - return SDValue(); + if (Op0.getSimpleValueType().isInteger()) { + SDValue X86CC; + SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); + SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; + } - SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + // Handle floating point. + X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG); + if (CondCode == X86::COND_INVALID) + return SDValue(); - if (IsStrict) - return DAG.getMergeValues({Res, Chain}, dl); + SDValue EFLAGS; + if (IsStrict) { + bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; + EFLAGS = + DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, + dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); + Chain = EFLAGS.getValue(1); + } else { + EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1); + } - return Res; + SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); + SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); + return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { @@ -21946,9 +22846,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); - APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), - Carry, DAG.getConstant(NegOne, DL, CarryVT)); + Carry, DAG.getAllOnesConstant(DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); @@ -22024,7 +22923,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || - Opc == X86ISD::SAHF) + Opc == X86ISD::FCMP) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || @@ -22057,9 +22956,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. 
// Otherwise FP cmovs get lowered into a less efficient branch sequence later. - if (Cond.getOpcode() == ISD::SETCC && - ((Subtarget.hasSSE2() && VT == MVT::f64) || - (Subtarget.hasSSE1() && VT == MVT::f32)) && + if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); bool IsAlwaysSignaling; @@ -22115,45 +23012,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } // AVX512 fallback is to lower selects of scalar floats to masked moves. - if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { + if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } - // For v64i1 without 64-bit support we need to split and rejoin. - if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { - assert(Subtarget.hasBWI() && "Expected BWI to be legal"); - SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32); - SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32); - SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32); - SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32); - SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo); - SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); - } - - if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, - Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, newSelect); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); - } - } - if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; @@ -22175,12 +23038,28 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); + SDValue CmpOp0 = Cmp.getOperand(0); unsigned CondCode = Cond.getConstantOperandVal(0); - if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + // Special handling for __builtin_ffs(X) - 1 pattern which looks like + // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special + // handle to keep the CMP with 0. This should be removed by + // optimizeCompareInst by using the flags from the BSR/TZCNT used for the + // cttz_zero_undef. 
+ auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) { + return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && + Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); + }; + if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) && + ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) || + (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) { + // Keep Cmp. + } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; - SDValue CmpOp0 = Cmp.getOperand(0); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb @@ -22188,31 +23067,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); - SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0); Zero = DAG.getConstant(0, DL, Op.getValueType()); - return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero); + return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1)); } - Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, + Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); - Cmp = ConvertCmpIfNecessary(Cmp, DAG); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); SDValue Res = // Res = 0 or -1. - DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); + DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1)); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - if (!isNullConstant(Op2)) - Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); - return Res; + return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { - SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 @@ -22265,7 +23138,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cmp = Cond.getOperand(1); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && - !isScalarFPTypeInSSEReg(VT)) // FPStack? + !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack? IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || @@ -22311,7 +23184,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 
0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { - Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && @@ -22333,7 +23205,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && - // Blacklist CopyFromReg to avoid partial register stalls. + // Exclude CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); @@ -22570,14 +23442,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); - // Custom legalize v8i8->v8i64 on CPUs without avx512bw. - if (InVT == MVT::v8i8) { - if (VT != MVT::v8i64) - return SDValue(); - - In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), - MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); - return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In); + if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { + assert(InVT == MVT::v32i8 && "Unexpected VT!"); + return splitVectorIntUnary(Op, DAG); } if (Subtarget.hasInt256()) @@ -22620,23 +23487,19 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { if (!Store->isSimple()) return SDValue(); - EVT StoreVT = StoredVal.getValueType(); - unsigned NumElems = StoreVT.getVectorNumElements(); - unsigned HalfSize = StoredVal.getValueSizeInBits() / 2; - unsigned HalfAlign = (128 == HalfSize ? 16 : 32); - SDLoc DL(Store); - SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize); - SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize); + SDValue Value0, Value1; + std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL); + unsigned HalfOffset = Value0.getValueType().getStoreSize(); SDValue Ptr0 = Store->getBasePtr(); - SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL); - unsigned Alignment = Store->getAlignment(); + SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL); SDValue Ch0 = DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), - Alignment, Store->getMemOperand()->getFlags()); + Store->getOriginalAlign(), + Store->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, - Store->getPointerInfo().getWithOffset(HalfAlign), - MinAlign(Alignment, HalfAlign), + Store->getPointerInfo().getWithOffset(HalfOffset), + Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); } @@ -22659,7 +23522,6 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, MVT StoreSVT = StoreVT.getScalarType(); unsigned NumElems = StoreVT.getVectorNumElements(); unsigned ScalarSize = StoreSVT.getStoreSize(); - unsigned Alignment = Store->getAlignment(); SDLoc DL(Store); SmallVector<SDValue, 4> Stores; @@ -22670,7 +23532,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, DAG.getIntPtrConstant(i, DL)); SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, Store->getPointerInfo().getWithOffset(Offset), - MinAlign(Alignment, Offset), + Store->getOriginalAlign(), Store->getMemOperand()->getFlags()); Stores.push_back(Ch); } @@ -22699,7 +23561,7 @@ static SDValue 
LowerStore(SDValue Op, const X86Subtarget &Subtarget, StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -22711,7 +23573,9 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, // and each half can execute independently. Some cores would split the op into // halves anyway, so the concat (vinsertf128) is purely an extra op. MVT StoreVT = StoredVal.getSimpleValueType(); - if (StoreVT.is256BitVector()) { + if (StoreVT.is256BitVector() || + ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) && + !Subtarget.hasBWI())) { SmallVector<SDValue, 4> CatOps; if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps)) return splitVectorStore(St, DAG); @@ -22738,7 +23602,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } assert(Subtarget.hasSSE1() && "Expected SSE"); @@ -22773,7 +23637,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. @@ -22801,163 +23665,44 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Op.getOperand(1).hasOneUse()); } -/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the -/// SETCC node has a single use. -static bool isXor1OfSetCC(SDValue Op) { - if (Op.getOpcode() != ISD::XOR) - return false; - if (isOneConstant(Op.getOperand(1))) - return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - return false; -} - SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { - bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); - SDValue CC; - bool Inverted = false; - if (Cond.getOpcode() == ISD::SETCC) { - // Check for setcc([su]{add,sub,mul}o == 0). - if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && - isNullConstant(Cond.getOperand(1)) && - Cond.getOperand(0).getResNo() == 1 && - (Cond.getOperand(0).getOpcode() == ISD::SADDO || - Cond.getOperand(0).getOpcode() == ISD::UADDO || - Cond.getOperand(0).getOpcode() == ISD::SSUBO || - Cond.getOperand(0).getOpcode() == ISD::USUBO || - Cond.getOperand(0).getOpcode() == ISD::SMULO || - Cond.getOperand(0).getOpcode() == ISD::UMULO)) { - Inverted = true; - Cond = Cond.getOperand(0); - } else { - if (SDValue NewCond = LowerSETCC(Cond, DAG)) - Cond = NewCond; - } - } -#if 0 - // FIXME: LowerXALUO doesn't handle these!! 
- else if (Cond.getOpcode() == X86ISD::ADD || - Cond.getOpcode() == X86ISD::SUB || - Cond.getOpcode() == X86ISD::SMUL || - Cond.getOpcode() == X86ISD::UMUL) - Cond = LowerXALUO(Cond, DAG); -#endif + if (Cond.getOpcode() == ISD::SETCC && + Cond.getOperand(0).getValueType() != MVT::f128) { + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - // Look pass (and (setcc_carry (cmp ...)), 1). - if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && - isOneConstant(Cond.getOperand(1))) - Cond = Cond.getOperand(0); + // Special case for + // setcc([su]{add,sub,mul}o == 0) + // setcc([su]{add,sub,mul}o != 1) + if (ISD::isOverflowIntrOpRes(LHS) && + (CC == ISD::SETEQ || CC == ISD::SETNE) && + (isNullConstant(RHS) || isOneConstant(RHS))) { + SDValue Value, Overflow; + X86::CondCode X86Cond; + std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG); - // If condition flag is set by a X86ISD::CMP, then use it as the condition - // setting operand in place of the X86ISD::SETCC. - unsigned CondOpcode = Cond.getOpcode(); - if (CondOpcode == X86ISD::SETCC || - CondOpcode == X86ISD::SETCC_CARRY) { - CC = Cond.getOperand(0); + if ((CC == ISD::SETEQ) == isNullConstant(RHS)) + X86Cond = X86::GetOppositeBranchCondition(X86Cond); - SDValue Cmp = Cond.getOperand(1); - unsigned Opc = Cmp.getOpcode(); - // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? - if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { - Cond = Cmp; - addTest = false; - } else { - switch (cast<ConstantSDNode>(CC)->getZExtValue()) { - default: break; - case X86::COND_O: - case X86::COND_B: - // These can only come from an arithmetic instruction with overflow, - // e.g. SADDO, UADDO. - Cond = Cond.getOperand(1); - addTest = false; - break; - } + SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } - } - CondOpcode = Cond.getOpcode(); - if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || - CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || - CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { - SDValue Value; - X86::CondCode X86Cond; - std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - if (Inverted) - X86Cond = X86::GetOppositeBranchCondition(X86Cond); + if (LHS.getSimpleValueType().isInteger()) { + SDValue CCVal; + SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + EFLAGS); + } - CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); - addTest = false; - } else { - unsigned CondOpc; - if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { - SDValue Cmp = Cond.getOperand(0).getOperand(1); - if (CondOpc == ISD::OR) { - // Also, recognize the pattern generated by an FCMP_UNE. We can emit - // two branches instead of an explicit OR instruction with a - // separate test. - if (Cmp == Cond.getOperand(1).getOperand(1) && - isX86LogicalCmp(Cmp)) { - CC = Cond.getOperand(0).getOperand(0); - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = Cond.getOperand(1).getOperand(0); - Cond = Cmp; - addTest = false; - } - } else { // ISD::AND - // Also, recognize the pattern generated by an FCMP_OEQ. We can emit - // two branches instead of an explicit AND instruction with a - // separate test. 
However, we only do this if this block doesn't - // have a fall-through edge, because this requires an explicit - // jmp when the condition is false. - if (Cmp == Cond.getOperand(1).getOperand(1) && - isX86LogicalCmp(Cmp) && - Op.getNode()->hasOneUse()) { - X86::CondCode CCode0 = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode0 = X86::GetOppositeBranchCondition(CCode0); - CC = DAG.getTargetConstant(CCode0, dl, MVT::i8); - SDNode *User = *Op.getNode()->use_begin(); - // Look for an unconditional branch following this conditional branch. - // We need this because we need to reverse the successors in order - // to implement FCMP_OEQ. - if (User->getOpcode() == ISD::BR) { - SDValue FalseBB = User->getOperand(1); - SDNode *NewBR = - DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); - assert(NewBR == User); - (void)NewBR; - Dest = FalseBB; - - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, - Dest, CC, Cmp); - X86::CondCode CCode1 = - (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); - CCode1 = X86::GetOppositeBranchCondition(CCode1); - CC = DAG.getTargetConstant(CCode1, dl, MVT::i8); - Cond = Cmp; - addTest = false; - } - } - } - } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { - // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. - // It should be transformed during dag combiner except when the condition - // is set by a arithmetics with overflow node. - X86::CondCode CCode = - (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); - CCode = X86::GetOppositeBranchCondition(CCode); - CC = DAG.getTargetConstant(CCode, dl, MVT::i8); - Cond = Cond.getOperand(0).getOperand(1); - addTest = false; - } else if (Cond.getOpcode() == ISD::SETCC && - cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) { + if (CC == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't @@ -22976,59 +23721,65 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { (void)NewBR; Dest = FalseBB; - SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, - Cond.getOperand(0), Cond.getOperand(1)); - Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); - Cond = Cmp; - addTest = false; + SDValue Cmp = + DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); + SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); + Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, + CCVal, Cmp); + CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); } } - } else if (Cond.getOpcode() == ISD::SETCC && - cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) { + } else if (CC == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit OR instruction with a // separate test. 
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, - Cond.getOperand(0), Cond.getOperand(1)); - Cmp = ConvertCmpIfNecessary(Cmp, DAG); - CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); - Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cmp); - CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); - Cond = Cmp; - addTest = false; + SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); + SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); + Chain = + DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); + CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } else { + X86::CondCode X86Cond = + TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG); + SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); + SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); } } - if (addTest) { - // Look pass the truncate if the high bits are known zero. - if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + if (ISD::isOverflowIntrOpRes(Cond)) { + SDValue Value, Overflow; + X86::CondCode X86Cond; + std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); - // We know the result of AND is compared against zero. Try to match - // it to BT. - if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue BTCC; - if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) { - CC = BTCC; - Cond = BT; - addTest = false; - } - } + SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Overflow); } - if (addTest) { - X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; - CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); - Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget); - } - Cond = ConvertCmpIfNecessary(Cond, DAG); - return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), - Chain, Dest, CC, Cond); + // Look past the truncate if the high bits are known zero. + if (isTruncWithZeroHighBitsInput(Cond, DAG)) + Cond = Cond.getOperand(0); + + EVT CondVT = Cond.getValueType(); + + // Add an AND with 1 if we don't already have one. + if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))) + Cond = + DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT)); + + SDValue LHS = Cond; + SDValue RHS = DAG.getConstant(0, dl, CondVT); + + SDValue CCVal; + SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal); + return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + EFLAGS); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. @@ -23041,9 +23792,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); - bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); + bool EmitStackProbeCall = hasStackProbeSymbol(MF); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || - SplitStack || EmitStackProbe; + SplitStack || EmitStackProbeCall; SDLoc dl(Op); // Get the inputs. 
@@ -23067,12 +23818,22 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); - SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); - Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Align StackAlign(TFI.getStackAlignment()); - Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value - if (Alignment && Alignment > StackAlign) + const Align StackAlign = TFI.getStackAlign(); + if (hasInlineStackProbe(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); + Register Vreg = MRI.createVirtualRegister(AddrRegClass); + Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); + Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain, + DAG.getRegister(Vreg, SPTy)); + } else { + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + } + if (Alignment && *Alignment > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); @@ -23203,14 +23964,13 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. - if (ArgVT == MVT::f80) { - llvm_unreachable("va_arg for f80 not yet implemented"); - } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { + assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented"); + if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. - } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { - ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { - llvm_unreachable("Unhandled argument type in LowerVAARG"); + assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ && + "Unhandled argument type in LowerVAARG"); + ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 
} if (ArgMode == 2) { @@ -23227,11 +23987,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( - X86ISD::VAARG_64, dl, - VTs, InstOps, MVT::i64, - MachinePointerInfo(SV), - /*Align=*/0, - MachineMemOperand::MOLoad | MachineMemOperand::MOStore); + X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), + /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it @@ -23255,9 +24012,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); SDLoc DL(Op); - return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, - DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, - false, false, + return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), + Align(8), /*isVolatile*/ false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } @@ -24004,8 +24760,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); + // Some conditions require the operands to be swapped. + if (CC == ISD::SETLT || CC == ISD::SETLE) + std::swap(LHS, RHS); + SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); - SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) @@ -24021,18 +24780,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, break; } case ISD::SETGT: // (CF = 0 and ZF = 0) + case ISD::SETLT: { // Condition opposite to GT. Operands swapped above. SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; - case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. - SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); - break; } case ISD::SETGE: // CF = 0 + case ISD::SETLE: // Condition opposite to GE. Operands swapped above. SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; - case ISD::SETLE: // The condition is opposite to GE. Swap the operands. - SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); - break; default: llvm_unreachable("Unexpected illegal condition!"); } @@ -24481,6 +25236,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, // Clamp out of bounds shift amounts since they will otherwise be masked // to 8-bits which may make it no longer out of bounds. unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255); + if (ShiftAmount == 0) + return Op.getOperand(1); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), Op.getOperand(0), Op.getOperand(1), DAG.getTargetConstant(ShiftAmount, DL, MVT::i32)); @@ -24540,19 +25298,23 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, TLI.getPointerTy(DAG.getDataLayout())); EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? 
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); + // Cast mask to an integer type. + Mask = DAG.getBitcast(MaskVT, Mask); + MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); - return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); + SDValue Res = + DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, + MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, @@ -24577,7 +25339,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? @@ -24587,9 +25349,10 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); - return DAG.getMergeValues({ Res, Res.getValue(2) }, dl); + SDValue Res = + DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, + MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return DAG.getMergeValues({Res, Res.getValue(1)}, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -24615,11 +25378,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand()); - return Res.getValue(1); + SDValue Res = + DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, + MemIntr->getMemoryVT(), MemIntr->getMemOperand()); + return Res; } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, @@ -24778,13 +25542,11 @@ static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; - return SignedSat ? - DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : - DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); + unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS; + return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO); } /// Emit Masked Truncating Store with signed or unsigned saturation. 
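
As a point of reference for the gather/scatter hunks above, the element-wise semantics that the X86ISD::MGATHER node carries can be modelled with plain scalar code; a minimal standalone sketch (the helper name, element count and test values are illustrative only, and the hardware byte scale is folded into the index here):

    #include <cstdio>

    // Scalar model of a masked gather: each enabled lane loads base[index[i]]
    // (scale folded into the index for simplicity); disabled lanes keep the
    // pass-through value. This is the operation behind the
    // {chain, src, mask, base, index, scale} operand list; with the mask type
    // dropped from the VT list, the chain result is now value #1 instead of #2.
    static void gatherModel(int *dst, const int *base, const int *idx,
                            const bool *mask, const int *passthru, int n) {
      for (int i = 0; i < n; ++i)
        dst[i] = mask[i] ? base[idx[i]] : passthru[i];
    }

    int main() {
      int mem[8] = {10, 11, 12, 13, 14, 15, 16, 17};
      int idx[4] = {7, 0, 3, 5};
      bool mask[4] = {true, false, true, true};
      int pass[4] = {-1, -1, -1, -1};
      int out[4];
      gatherModel(out, mem, idx, mask, pass, 4);
      for (int i = 0; i < 4; ++i)
        std::printf("%d ", out[i]);          // prints: 17 -1 13 15
      std::printf("\n");
      return 0;
    }
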
@@ -24792,12 +25554,10 @@ static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Val, Ptr, Mask }; - return SignedSat ? - DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) : - DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO); + unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS; + return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, @@ -25147,7 +25907,7 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } -unsigned X86TargetLowering::getExceptionPointerRegister( +Register X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; @@ -25155,7 +25915,7 @@ unsigned X86TargetLowering::getExceptionPointerRegister( return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; } -unsigned X86TargetLowering::getExceptionSelectorRegister( +Register X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); @@ -25179,7 +25939,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); - unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; + Register StoreAddrReg = (PtrVT == MVT::i64) ? 
X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize(), @@ -25393,93 +26153,51 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 2 Round to +inf 3 Round to -inf - To perform the conversion, we do: - (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) + To perform the conversion, we use a packed lookup table of the four 2-bit + values that we can index by FPSP[11:10] + 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10] + + (0x2d >> ((FPSR & 0xc00) >> 9)) & 3 */ MachineFunction &MF = DAG.getMachineFunction(); - const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Align StackAlignment(TFI.getStackAlignment()); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot - int SSFI = - MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false); + int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), - MachineMemOperand::MOStore, 2, 2); + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); - SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; - SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, - DAG.getVTList(MVT::Other), - Ops, MVT::i16, MMO); + SDValue Chain = Op.getOperand(0); + SDValue Ops[] = {Chain, StackSlot}; + Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, + DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI, + Align(2), MachineMemOperand::MOStore); // Load FP Control Word from stack slot - SDValue CWD = - DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2)); + Chain = CWD.getValue(1); - // Transform as necessary - SDValue CWD1 = + // Mask and turn the control bits into a shift for the lookup table. + SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0x800, DL, MVT::i16)), - DAG.getConstant(11, DL, MVT::i8)); - SDValue CWD2 = - DAG.getNode(ISD::SRL, DL, MVT::i16, - DAG.getNode(ISD::AND, DL, MVT::i16, - CWD, DAG.getConstant(0x400, DL, MVT::i16)), + CWD, DAG.getConstant(0xc00, DL, MVT::i16)), DAG.getConstant(9, DL, MVT::i8)); + Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift); + SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32); SDValue RetVal = - DAG.getNode(ISD::AND, DL, MVT::i16, - DAG.getNode(ISD::ADD, DL, MVT::i16, - DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), - DAG.getConstant(1, DL, MVT::i16)), - DAG.getConstant(3, DL, MVT::i16)); + DAG.getNode(ISD::AND, DL, MVT::i32, + DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift), + DAG.getConstant(3, DL, MVT::i32)); - return DAG.getNode((VT.getSizeInBits() < 16 ? - ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); -} + RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT); -// Split an unary integer op into 2 half sized ops. 
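
The packed lookup table used by the new LowerFLT_ROUNDS_ sequence is easy to check in isolation; a minimal host-side sketch of the same arithmetic (illustrative only -- it takes a control-word value as a parameter rather than executing FNSTCW):

    #include <cassert>
    #include <cstdio>

    // x87 control word rounding field is bits 11:10:
    //   0 = nearest, 1 = toward -inf, 2 = toward +inf, 3 = toward zero.
    // FLT_ROUNDS encodes: 0 = toward zero, 1 = nearest, 2 = +inf, 3 = -inf.
    // 0x2d = 0b00101101 packs the answers (1,3,2,0) low-to-high, two bits per
    // rounding mode, and ((cw & 0xc00) >> 9) is exactly 2 * RC.
    static int fltRoundsFromCW(unsigned cw) {
      return (0x2d >> ((cw & 0xc00) >> 9)) & 3;
    }

    int main() {
      const int expected[4] = {1, 3, 2, 0};   // FLT_ROUNDS value per RC field
      for (unsigned rc = 0; rc < 4; ++rc) {
        unsigned cw = rc << 10;               // only bits 11:10 matter here
        assert(fltRoundsFromCW(cw) == expected[rc]);
        std::printf("RC=%u -> FLT_ROUNDS=%d\n", rc, fltRoundsFromCW(cw));
      }
      return 0;
    }
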
-static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - unsigned NumElems = VT.getVectorNumElements(); - unsigned SizeInBits = VT.getSizeInBits(); - MVT EltVT = VT.getVectorElementType(); - SDValue Src = Op.getOperand(0); - assert(EltVT == Src.getSimpleValueType().getVectorElementType() && - "Src and Op should have the same element type!"); - - // Extract the Lo/Hi vectors - SDLoc dl(Op); - SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); - SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); - - MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), - DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); -} - -// Decompose 256-bit ops into smaller 128-bit ops. -static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { - assert(Op.getSimpleValueType().is256BitVector() && - Op.getSimpleValueType().isInteger() && - "Only handle AVX 256-bit vector integer operation"); - return LowerVectorIntUnary(Op, DAG); -} - -// Decompose 512-bit ops into smaller 256-bit ops. -static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { - assert(Op.getSimpleValueType().is512BitVector() && - Op.getSimpleValueType().isInteger() && - "Only handle AVX 512-bit vector integer operation"); - return LowerVectorIntUnary(Op, DAG); + return DAG.getMergeValues({RetVal, Chain}, DL); } /// Lower a vector CTLZ using native supported vector CTLZ instruction. @@ -25502,7 +26220,7 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, // Split vector, it's Lo and Hi parts will be handled in next iteration. if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ())) - return LowerVectorIntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && @@ -25612,11 +26330,11 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) - return Lower512IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); @@ -25682,64 +26400,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } -/// Break a 256-bit integer operation into two new 128-bit ones and then -/// concatenate the result back. 
-static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - - assert(VT.is256BitVector() && VT.isInteger() && - "Unsupported value type for operation"); - - unsigned NumElems = VT.getVectorNumElements(); - SDLoc dl(Op); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); - - // Extract the RHS vectors - SDValue RHS = Op.getOperand(1); - SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); - - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); -} - -/// Break a 512-bit integer operation into two new 256-bit ones and then -/// concatenate the result back. -static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) { - MVT VT = Op.getSimpleValueType(); - - assert(VT.is512BitVector() && VT.isInteger() && - "Unsupported value type for operation"); - - unsigned NumElems = VT.getVectorNumElements(); - SDLoc dl(Op); - - // Extract the LHS vectors - SDValue LHS = Op.getOperand(0); - SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); - SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); - - // Extract the RHS vectors - SDValue RHS = Op.getOperand(1); - SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); - SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); - - MVT EltVT = VT.getVectorElementType(); - MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); - - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), - DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); -} - static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); @@ -25750,10 +26410,13 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); } static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, @@ -25798,10 +26461,13 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, return SDValue(); } + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); } static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, @@ -25831,9 +26497,12 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, if (VT.is256BitVector() && !Subtarget.hasInt256()) { assert(VT.isInteger() && "Only handle AVX 256-bit vector integer operation"); - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); } + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntUnary(Op, DAG); + // Default to expand. 
return SDValue(); } @@ -25843,7 +26512,10 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { // For AVX1 cases, split to use legal ops (everything but v4i64). if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); @@ -25887,7 +26559,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntBinary(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); @@ -26033,7 +26708,10 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) + return splitVectorIntBinary(Op, DAG); if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || @@ -26122,41 +26800,9 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } - // For signed 512-bit vectors, split into 256-bit vectors to allow the - // sign-extension to occur. - if (VT == MVT::v64i8 && IsSigned) - return split512IntArith(Op, DAG); - - // Signed AVX2 implementation - extend xmm subvectors to ymm. - if (VT == MVT::v32i8 && IsSigned) { - MVT ExVT = MVT::v16i16; - SDValue ALo = extract128BitVector(A, 0, DAG, dl); - SDValue BLo = extract128BitVector(B, 0, DAG, dl); - SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl); - SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl); - ALo = DAG.getNode(ExAVX, dl, ExVT, ALo); - BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); - AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); - BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); - SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); - SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); - Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); - Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); - - // Bitcast back to VT and then pack all the even elements from Lo and Hi. - // Shuffle lowering should turn this into PACKUS+PERMQ - Lo = DAG.getBitcast(VT, Lo); - Hi = DAG.getBitcast(VT, Hi); - return DAG.getVectorShuffle(VT, dl, Lo, Hi, - { 0, 2, 4, 6, 8, 10, 12, 14, - 16, 18, 20, 22, 24, 26, 28, 30, - 32, 34, 36, 38, 40, 42, 44, 46, - 48, 50, 52, 54, 56, 58, 60, 62}); - } - - // For signed v16i8 and all unsigned vXi8 we will unpack the low and high - // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies, - // shift the results and pack the half lane results back together. + // For vXi8 we will unpack the low and high half of each 128 bit lane to widen + // to a vXi16 type. Do the multiplies, shift the results and pack the half + // lane results back together. 
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -26270,9 +26916,12 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, - MachinePointerInfo(), /* Alignment = */ 16); + MPI, /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; @@ -26413,7 +27062,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || - VT == MVT::v64i8) { + (Subtarget.hasBWI() && VT == MVT::v64i8)) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); @@ -26859,8 +27508,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && - (VT == MVT::v16i8 || VT == MVT::v64i8 || - (VT == MVT::v32i8 && Subtarget.hasInt256())) && + (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); @@ -26923,12 +27572,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, ISD::SETGT); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. + // On SSE41 targets we can use PBLENDVB which selects bytes based just + // on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, + DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true @@ -27038,14 +27688,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. + // On SSE41 targets we can use PBLENDVB which selects bytes based just on + // the sign bit. 
if (UseSSE41) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast( + VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in @@ -27096,7 +27747,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit shifts into 128-bit shifts. if (VT.is256BitVector()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); + + if (VT == MVT::v32i16 || VT == MVT::v64i8) + return splitVectorIntBinary(Op, DAG); return SDValue(); } @@ -27114,28 +27768,21 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, int NumElts = VT.getVectorNumElements(); // Check for constant splat rotation amount. - APInt UndefElts; - SmallVector<APInt, 32> EltBits; - int CstSplatIndex = -1; - if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) - for (int i = 0; i != NumElts; ++i) - if (!UndefElts[i]) { - if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) { - CstSplatIndex = i; - continue; - } - CstSplatIndex = -1; - break; - } + APInt CstSplatValue; + bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue); + + // Check for splat rotate by zero. + if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0) + return R; // AVX512 implicitly uses modulo rotation amounts. if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. - if (0 <= CstSplatIndex) { - unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); - uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); - return DAG.getNode(Op, DL, VT, R, - DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); + if (IsCstSplat) { + unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); + return DAG.getNode(RotOpc, DL, VT, R, + DAG.getTargetConstant(RotAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. @@ -27149,14 +27796,14 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { if (VT.is256BitVector()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. - if (0 <= CstSplatIndex) { - uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); + if (IsCstSplat) { + uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, - DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); + DAG.getTargetConstant(RotAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). @@ -27165,7 +27812,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Split 256-bit integers on pre-AVX2 targets. 
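
The PBLENDVB behaviour these SignBitSelect helpers rely on -- each result byte is taken from the second source when bit 7 of the corresponding selector byte is set, and from the first source otherwise -- can be modelled one byte at a time; a minimal standalone sketch (names and test values are illustrative):

    #include <cstdint>
    #include <cstdio>

    // Scalar model of PBLENDVB: only the top bit of each selector byte
    // participates, which is why the shift/rotate lowerings merely have to
    // move the interesting bit of the amount into the sign position of every
    // selector byte before each blend.
    static void blendvb(uint8_t *dst, const uint8_t *sel, const uint8_t *a,
                        const uint8_t *b, int n) {
      for (int i = 0; i < n; ++i)
        dst[i] = (sel[i] & 0x80) ? b[i] : a[i];
    }

    int main() {
      uint8_t sel[4] = {0x00, 0x7f, 0x80, 0xff};  // only 0x80 and 0xff pick 'b'
      uint8_t a[4] = {10, 11, 12, 13};
      uint8_t b[4] = {20, 21, 22, 23};
      uint8_t out[4];
      blendvb(out, sel, a, b, 4);
      for (int i = 0; i < 4; ++i)
        std::printf("%u ", out[i]);               // prints: 10 11 22 23
      std::printf("\n");
      return 0;
    }
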
if (VT.is256BitVector() && !Subtarget.hasAVX2()) - return split256IntArith(Op, DAG); + return splitVectorIntBinary(Op, DAG); assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && @@ -27173,7 +27820,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, "Only vXi32/vXi16/vXi8 vector rotates supported"); // Rotate by an uniform constant - expand back to shifts. - if (0 <= CstSplatIndex) + if (IsCstSplat) return SDValue(); bool IsSplatAmt = DAG.isSplatValue(Amt); @@ -27189,12 +27836,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { - // On SSE41 targets we make use of the fact that VSELECT lowers - // to PBLENDVB which selects bytes based just on the sign bit. + // On SSE41 targets we can use PBLENDVB which selects bytes based just + // on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, + DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true @@ -27306,15 +27954,14 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { return false; } -// TODO: In 32-bit mode, use MOVLPS when SSE1 is available? -// TODO: In 32-bit mode, use FISTP when X87 is available? bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { Type *MemType = SI->getValueOperand()->getType(); bool NoImplicitFloatOps = SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && - !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2()) + !Subtarget.useSoftFloat() && !NoImplicitFloatOps && + (Subtarget.hasSSE1() || Subtarget.hasX87())) return false; return needsCmpXchgNb(MemType); @@ -27333,7 +27980,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && !Subtarget.useSoftFloat() && !NoImplicitFloatOps && - (Subtarget.hasSSE2() || Subtarget.hasX87())) + (Subtarget.hasSSE1() || Subtarget.hasX87())) return AtomicExpansionKind::None; return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg @@ -27399,7 +28046,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { AI->use_empty()) return nullptr; - auto Builder = IRBuilder<>(AI); + IRBuilder<> Builder(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or @@ -27441,7 +28088,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // Finally we can emit the atomic load. 
LoadInst *Loaded = Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(), - AI->getType()->getPrimitiveSizeInBits()); + Align(AI->getType()->getPrimitiveSizeInBits())); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -27636,18 +28283,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } - // Custom splitting for BWI types when AVX512F is available but BWI isn't. - if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() && - DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) { - SDLoc dl(Op); - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); - MVT CastVT = DstVT.getHalfNumVectorElementsVT(); - Lo = DAG.getBitcast(CastVT, Lo); - Hi = DAG.getBitcast(CastVT, Hi); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); - } - // Use MOVMSK for vector to scalar conversion to prevent scalarization. if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) { assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512"); @@ -27831,11 +28466,11 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) - return Lower512IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // For element types greater than i8, do vXi8 pop counts and a bytesum. if (VT.getScalarType() != MVT::i8) { @@ -27879,7 +28514,7 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); assert(VT.is128BitVector() && "Only 128-bit vector bitreverse lowering supported."); @@ -27916,12 +28551,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SDValue In = Op.getOperand(0); SDLoc DL(Op); - // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB - // lowering. - if (VT == MVT::v8i64 || VT == MVT::v16i32) { - assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE"); - return Lower512IntUnary(Op, DAG); - } + // Split v64i8 without BWI so that we can still use the PSHUFB lowering. + if (VT == MVT::v64i8 && !Subtarget.hasBWI()) + return splitVectorIntUnary(Op, DAG); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && @@ -27929,7 +28561,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. if (VT.is256BitVector() && !Subtarget.hasInt256()) - return Lower256IntUnary(Op, DAG); + return splitVectorIntUnary(Op, DAG); // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each @@ -28073,28 +28705,54 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, return Op; if (VT == MVT::i64 && !IsTypeLegal) { - // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled. - // FIXME: Use movlps with SSE1. - // FIXME: Use fist with X87. + // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE + // is enabled. 
bool NoImplicitFloatOps = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); - if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && - Subtarget.hasSSE2()) { - SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, - Node->getOperand(2)); - SDVTList Tys = DAG.getVTList(MVT::Other); - SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() }; - SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, - Ops, MVT::i64, - Node->getMemOperand()); + if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { + SDValue Chain; + if (Subtarget.hasSSE1()) { + SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, + Node->getOperand(2)); + MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; + SclToVec = DAG.getBitcast(StVT, SclToVec); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()}; + Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, + MVT::i64, Node->getMemOperand()); + } else if (Subtarget.hasX87()) { + // First load this into an 80-bit X87 register using a stack temporary. + // This will put the whole integer into the significand. + SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); + int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); + Chain = + DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr, + MPI, /*Align*/ 0, MachineMemOperand::MOStore); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); + SDValue LdOps[] = {Chain, StackPtr}; + SDValue Value = + DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI, + /*Align*/ None, MachineMemOperand::MOLoad); + Chain = Value.getValue(1); + + // Now use an FIST to do the atomic store. + SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()}; + Chain = + DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other), + StoreOps, MVT::i64, Node->getMemOperand()); + } - // If this is a sequentially consistent store, also emit an appropriate - // barrier. - if (IsSeqCst) - Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); + if (Chain) { + // If this is a sequentially consistent store, also emit an appropriate + // barrier. + if (IsSeqCst) + Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); - return Chain; + return Chain; + } } } @@ -28123,9 +28781,8 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { // Set the carry flag. SDValue Carry = Op.getOperand(2); EVT CarryVT = Carry.getValueType(); - APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), - Carry, DAG.getConstant(NegOne, DL, CarryVT)); + Carry, DAG.getAllOnesConstant(DL, CarryVT)); unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB; SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), @@ -28170,7 +28827,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? 
(Type *)StructType::get(ArgTy, ArgTy) - : (Type *)VectorType::get(ArgTy, 4); + : (Type *)FixedVectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -28267,17 +28924,15 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); - SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); + SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); + return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, + N->getMemoryVT(), N->getMemOperand()); } return SDValue(); } MVT IndexVT = Index.getSimpleValueType(); - MVT MaskVT = Mask.getSimpleValueType(); // If the index is v2i32, we're being called by type legalization and we // should just let the default handling take care of it. @@ -28295,18 +28950,17 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); - MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); Src = ExtendToType(Src, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } - SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); + SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; - SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( - VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); - return SDValue(NewScatter.getNode(), 1); + return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, + N->getMemoryVT(), N->getMemOperand()); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, @@ -28332,8 +28986,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); // Emit a blend. 
- SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, - PassThru); + SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); } @@ -28369,10 +29022,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), N->isExpandingLoad()); - SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - NewLoad.getValue(0), - DAG.getIntPtrConstant(0, dl)); - SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + SDValue Extract = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Extract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } @@ -28430,7 +29083,6 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue Mask = N->getMask(); SDValue PassThru = N->getPassThru(); MVT IndexVT = Index.getSimpleValueType(); - MVT MaskVT = Mask.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); @@ -28451,7 +29103,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); - MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); PassThru = ExtendToType(PassThru, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); @@ -28460,12 +29112,12 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, N->getScale() }; - SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), + SDValue NewGather = DAG.getMemIntrinsicNode( + X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(), N->getMemOperand()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather, DAG.getIntPtrConstant(0, dl)); - return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); + return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl); } static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { @@ -28531,6 +29183,20 @@ SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, return Tmp.first; } +// Custom split CVTPS2PH with wide types. +static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + SDValue RC = Op.getOperand(1); + Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC); + Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); +} + /// Provide custom lowering hooks for some operations. 
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -28584,14 +29250,21 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::FP16_TO_FP: + case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG); + case ISD::FP_TO_FP16: + case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); + case ISD::LRINT: + case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG); case ISD::SETCC: case ISD::STRICT_FSETCC: case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); @@ -28659,8 +29332,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); - case ISD::ADDRSPACECAST: - return LowerADDRSPACECAST(Op, DAG); + case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); + case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); } } @@ -28706,6 +29379,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, N->dump(&DAG); #endif llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::CVTPH2PS: { + EVT VT = N->getValueType(0); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo); + Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } + case X86ISD::STRICT_CVTPH2PS: { + EVT VT = N->getValueType(0); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other}, + {N->getOperand(0), Lo}); + Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other}, + {N->getOperand(0), Hi}); + SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Lo.getValue(1), Hi.getValue(1)); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + Results.push_back(Chain); + return; + } case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); // Use a v2i64 if possible. 
@@ -28775,7 +29477,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ABS: { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); MVT HalfT = MVT::i32; @@ -28788,15 +29489,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getConstant(1, dl, HalfT)); Tmp = DAG.getNode( ISD::SRA, dl, HalfT, Hi, - DAG.getConstant(HalfT.getSizeInBits() - 1, dl, - TLI.getShiftAmountTy(HalfT, DAG.getDataLayout()))); + DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl)); Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, SDValue(Lo.getNode(), 1)); Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); - Results.push_back(Lo); - Results.push_back(Hi); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi)); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. @@ -29148,6 +29847,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::LRINT: + case ISD::LLRINT: { + if (SDValue V = LRINT_LLRINTHelper(N, DAG)) + Results.push_back(V); + return; + } + case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: case ISD::UINT_TO_FP: @@ -29185,14 +29891,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); for (int i = 0; i != 2; ++i) { - SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, SignSrc, DAG.getIntPtrConstant(i, dl)); if (IsStrict) SignCvts[i] = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, - {N->getOperand(0), Src}); + {N->getOperand(0), Elt}); else - SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src); + SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt); }; SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); SDValue Slow, Chain; @@ -29272,7 +29978,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(V.getValue(1)); return; } - case ISD::FP_EXTEND: { + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && @@ -29394,15 +30101,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Attribute::NoImplicitFloat); if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { auto *Node = cast<AtomicSDNode>(N); - if (Subtarget.hasSSE2()) { - // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the - // lower 64-bits. - SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); + if (Subtarget.hasSSE1()) { + // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS. + // Then extract the lower 64-bits. + MVT LdVT = Subtarget.hasSSE2() ? 
MVT::v2i64 : MVT::v4f32; + SDVTList Tys = DAG.getVTList(LdVT, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); - SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + if (Subtarget.hasSSE2()) { + SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Ld.getValue(1)); + return; + } + // We use an alternative sequence for SSE1 that extracts as v2f32 and + // then casts to i64. This avoids a 128-bit stack temporary being + // created by type legalization if we were to cast v4f32->v2i64. + SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld, DAG.getIntPtrConstant(0, dl)); + Res = DAG.getBitcast(MVT::i64, Res); Results.push_back(Res); Results.push_back(Ld.getValue(1)); return; @@ -29410,14 +30129,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasX87()) { // First load this into an 80-bit X87 register. This will put the whole // integer into the significand. - // FIXME: Do we need to glue? See FIXME comment in BuildFILD. - SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue); + SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; - SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG, + SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, Node->getMemOperand()); SDValue Chain = Result.getValue(1); - SDValue InFlag = Result.getValue(2); // Now store the X87 register to a stack temporary and convert to i64. // This store is not atomic and doesn't need to be. @@ -29427,11 +30144,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag }; - Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl, - DAG.getVTList(MVT::Other), StoreOps, - MVT::i64, MPI, 0 /*Align*/, - MachineMemOperand::MOStore); + SDValue StoreOps[] = { Chain, Result, StackPtr }; + Chain = DAG.getMemIntrinsicNode( + X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, + MPI, None /*Align*/, MachineMemOperand::MOStore); // Finally load the value back from the stack temporary and return it. // This load is not atomic and doesn't need to be. @@ -29480,24 +30196,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } - // Custom splitting for BWI types when AVX512F is available but BWI isn't. - if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) && - SrcVT.isVector() && isTypeLegal(SrcVT)) { - SDValue Lo, Hi; - std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); - MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8; - Lo = DAG.getBitcast(CastVT, Lo); - Hi = DAG.getBitcast(CastVT, Hi); - SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); - Results.push_back(Res); - return; - } - if (DstVT.isVector() && SrcVT == MVT::x86mmx) { + // FIXME: Use v4f32 for SSE1? 
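The i64 ABS expansion a little earlier relies on the identity abs(x) = (x + (x >> 63)) ^ (x >> 63), carried out over 32-bit halves with an explicit carry. A hedged scalar sketch of the same SRA / UADDO / ADDCARRY / XOR sequence (the function name is illustrative):

#include <cstdint>

// Illustrative only: 64-bit abs built from 32-bit halves, mirroring the
// SRA / UADDO / ADDCARRY / XOR node sequence in the ABS expansion above.
static uint64_t abs64_via_halves(int64_t x) {
  uint32_t lo = uint32_t(x);
  uint32_t hi = uint32_t(uint64_t(x) >> 32);
  uint32_t t  = (hi & 0x80000000u) ? 0xFFFFFFFFu : 0u; // sign mask (SRA by 31)
  uint64_t sum = uint64_t(lo) + t;                     // UADDO: add, capture carry
  uint32_t newLo = uint32_t(sum);
  uint32_t carry = uint32_t(sum >> 32);
  uint32_t newHi = hi + t + carry;                     // ADDCARRY
  newLo ^= t;                                          // conditional negate via XOR
  newHi ^= t;
  return (uint64_t(newHi) << 32) | newLo;
}
// abs64_via_halves(-5) == 5; INT64_MIN maps to itself, as with the DAG sequence.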
+ assert(Subtarget.hasSSE2() && "Requires SSE2"); assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && "Unexpected type action!"); EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); - SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0)); + SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, + N->getOperand(0)); + Res = DAG.getBitcast(WideVT, Res); Results.push_back(Res); return; } @@ -29529,11 +30236,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; - SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl, - Gather->getMemoryVT(), Gather->getMemOperand()); + SDValue Res = DAG.getMemIntrinsicNode( + X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops, + Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); - Results.push_back(Res.getValue(2)); + Results.push_back(Res.getValue(1)); return; } return; @@ -29552,7 +30259,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, if (Subtarget.hasSSE2()) { MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), + Ld->getPointerInfo(), Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); MVT VecVT = MVT::getVectorVT(LdVT, 2); @@ -29573,25 +30280,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::ADDRSPACECAST: { - SDValue Src = N->getOperand(0); - EVT DstVT = N->getValueType(0); - AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N); - unsigned SrcAS = CastN->getSrcAddressSpace(); - - assert(SrcAS != CastN->getDestAddressSpace() && - "addrspacecast must be between different address spaces"); - - SDValue Res; - if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) - Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); - else if (DstVT == MVT::i64) - Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); - else if (DstVT == MVT::i32) - Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); - else - report_fatal_error("Unrecognized addrspacecast type legalization"); - - Results.push_back(Res); + SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG); + Results.push_back(V); return; } } @@ -29600,362 +30290,367 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; - case X86ISD::BSF: return "X86ISD::BSF"; - case X86ISD::BSR: return "X86ISD::BSR"; - case X86ISD::SHLD: return "X86ISD::SHLD"; - case X86ISD::SHRD: return "X86ISD::SHRD"; - case X86ISD::FAND: return "X86ISD::FAND"; - case X86ISD::FANDN: return "X86ISD::FANDN"; - case X86ISD::FOR: return "X86ISD::FOR"; - case X86ISD::FXOR: return "X86ISD::FXOR"; - case X86ISD::FILD: return "X86ISD::FILD"; - case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; - case X86ISD::FIST: return "X86ISD::FIST"; - case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM"; - case X86ISD::FLD: return "X86ISD::FLD"; - case X86ISD::FST: return "X86ISD::FST"; - case X86ISD::CALL: return "X86ISD::CALL"; - case X86ISD::BT: return "X86ISD::BT"; - case X86ISD::CMP: return "X86ISD::CMP"; - case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP"; - case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS"; - case X86ISD::COMI: 
return "X86ISD::COMI"; - case X86ISD::UCOMI: return "X86ISD::UCOMI"; - case X86ISD::CMPM: return "X86ISD::CMPM"; - case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM"; - case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE"; - case X86ISD::SETCC: return "X86ISD::SETCC"; - case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; - case X86ISD::FSETCC: return "X86ISD::FSETCC"; - case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; - case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE"; - case X86ISD::CMOV: return "X86ISD::CMOV"; - case X86ISD::BRCOND: return "X86ISD::BRCOND"; - case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; - case X86ISD::IRET: return "X86ISD::IRET"; - case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; - case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; - case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; - case X86ISD::Wrapper: return "X86ISD::Wrapper"; - case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; - case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ"; - case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; - case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; - case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; - case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; - case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; - case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; - case X86ISD::PINSRB: return "X86ISD::PINSRB"; - case X86ISD::PINSRW: return "X86ISD::PINSRW"; - case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; - case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::BLENDI: return "X86ISD::BLENDI"; - case X86ISD::BLENDV: return "X86ISD::BLENDV"; - case X86ISD::HADD: return "X86ISD::HADD"; - case X86ISD::HSUB: return "X86ISD::HSUB"; - case X86ISD::FHADD: return "X86ISD::FHADD"; - case X86ISD::FHSUB: return "X86ISD::FHSUB"; - case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; - case X86ISD::FMAX: return "X86ISD::FMAX"; - case X86ISD::FMAXS: return "X86ISD::FMAXS"; - case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE"; - case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE"; - case X86ISD::FMIN: return "X86ISD::FMIN"; - case X86ISD::FMINS: return "X86ISD::FMINS"; - case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE"; - case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE"; - case X86ISD::FMAXC: return "X86ISD::FMAXC"; - case X86ISD::FMINC: return "X86ISD::FMINC"; - case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; - case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; - case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; - case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; - case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; - case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; - case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; - case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; - case X86ISD::EH_SJLJ_SETUP_DISPATCH: - return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; - case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; - case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; - case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; - case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; - case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; - case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; - case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; - case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: - return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; - case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: - return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; - case X86ISD::LADD: return "X86ISD::LADD"; - case X86ISD::LSUB: return "X86ISD::LSUB"; - case X86ISD::LOR: return 
"X86ISD::LOR"; - case X86ISD::LXOR: return "X86ISD::LXOR"; - case X86ISD::LAND: return "X86ISD::LAND"; - case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; - case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; - case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE"; - case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; - case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; - case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC"; - case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS"; - case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS"; - case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; - case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; - case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; - case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; - case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; - case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT"; - case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE"; - case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS"; - case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE"; - case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; - case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND"; - case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; - case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; - case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS"; - case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; - case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; - case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; - case X86ISD::VSHL: return "X86ISD::VSHL"; - case X86ISD::VSRL: return "X86ISD::VSRL"; - case X86ISD::VSRA: return "X86ISD::VSRA"; - case X86ISD::VSHLI: return "X86ISD::VSHLI"; - case X86ISD::VSRLI: return "X86ISD::VSRLI"; - case X86ISD::VSRAI: return "X86ISD::VSRAI"; - case X86ISD::VSHLV: return "X86ISD::VSHLV"; - case X86ISD::VSRLV: return "X86ISD::VSRLV"; - case X86ISD::VSRAV: return "X86ISD::VSRAV"; - case X86ISD::VROTLI: return "X86ISD::VROTLI"; - case X86ISD::VROTRI: return "X86ISD::VROTRI"; - case X86ISD::VPPERM: return "X86ISD::VPPERM"; - case X86ISD::CMPP: return "X86ISD::CMPP"; - case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP"; - case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; - case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; - case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; - case X86ISD::ADD: return "X86ISD::ADD"; - case X86ISD::SUB: return "X86ISD::SUB"; - case X86ISD::ADC: return "X86ISD::ADC"; - case X86ISD::SBB: return "X86ISD::SBB"; - case X86ISD::SMUL: return "X86ISD::SMUL"; - case X86ISD::UMUL: return "X86ISD::UMUL"; - case X86ISD::OR: return "X86ISD::OR"; - case X86ISD::XOR: return "X86ISD::XOR"; - case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BEXTR: return "X86ISD::BEXTR"; - case X86ISD::BZHI: return "X86ISD::BZHI"; - case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; - case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; - case X86ISD::PTEST: return "X86ISD::PTEST"; - case X86ISD::TESTP: return "X86ISD::TESTP"; - case X86ISD::KORTEST: return "X86ISD::KORTEST"; - case X86ISD::KTEST: return "X86ISD::KTEST"; - case X86ISD::KADD: return "X86ISD::KADD"; - case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; - case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; - case X86ISD::PACKSS: return "X86ISD::PACKSS"; - case X86ISD::PACKUS: return "X86ISD::PACKUS"; - case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; - case X86ISD::VALIGN: return "X86ISD::VALIGN"; - case X86ISD::VSHLD: return "X86ISD::VSHLD"; - case X86ISD::VSHRD: return "X86ISD::VSHRD"; - case 
X86ISD::VSHLDV: return "X86ISD::VSHLDV"; - case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; - case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; - case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; - case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; - case X86ISD::SHUFP: return "X86ISD::SHUFP"; - case X86ISD::SHUF128: return "X86ISD::SHUF128"; - case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; - case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; - case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; - case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; - case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; - case X86ISD::MOVSD: return "X86ISD::MOVSD"; - case X86ISD::MOVSS: return "X86ISD::MOVSS"; - case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; - case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; - case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; - case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD"; - case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; - case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; - case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; - case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; - case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; - case X86ISD::VPERMV: return "X86ISD::VPERMV"; - case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; - case X86ISD::VPERMI: return "X86ISD::VPERMI"; - case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; - case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; - case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE"; - case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; - case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE"; - case X86ISD::VRANGE: return "X86ISD::VRANGE"; - case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE"; - case X86ISD::VRANGES: return "X86ISD::VRANGES"; - case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE"; - case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; - case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; - case X86ISD::PSADBW: return "X86ISD::PSADBW"; - case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; - case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; - case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; - case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; - case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; - case X86ISD::MFENCE: return "X86ISD::MFENCE"; - case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; - case X86ISD::SAHF: return "X86ISD::SAHF"; - case X86ISD::RDRAND: return "X86ISD::RDRAND"; - case X86ISD::RDSEED: return "X86ISD::RDSEED"; - case X86ISD::RDPKRU: return "X86ISD::RDPKRU"; - case X86ISD::WRPKRU: return "X86ISD::WRPKRU"; - case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; - case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; - case X86ISD::VPSHA: return "X86ISD::VPSHA"; - case X86ISD::VPSHL: return "X86ISD::VPSHL"; - case X86ISD::VPCOM: return "X86ISD::VPCOM"; - case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; - case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; - case X86ISD::FMSUB: return "X86ISD::FMSUB"; - case X86ISD::FNMADD: return "X86ISD::FNMADD"; - case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; - case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; - case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; - case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; - case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; - case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; - case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; - case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; - case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; - case 
X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; - case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; - case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; - case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE"; - case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE"; - case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; - case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE"; - case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; - case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE"; - case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; - case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE"; - case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; - case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE"; - case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; - case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE"; - case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; - case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; - case X86ISD::XTEST: return "X86ISD::XTEST"; - case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; - case X86ISD::EXPAND: return "X86ISD::EXPAND"; - case X86ISD::SELECTS: return "X86ISD::SELECTS"; - case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; - case X86ISD::RCP14: return "X86ISD::RCP14"; - case X86ISD::RCP14S: return "X86ISD::RCP14S"; - case X86ISD::RCP28: return "X86ISD::RCP28"; - case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE"; - case X86ISD::RCP28S: return "X86ISD::RCP28S"; - case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE"; - case X86ISD::EXP2: return "X86ISD::EXP2"; - case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE"; - case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; - case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; - case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; - case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE"; - case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; - case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE"; - case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; - case X86ISD::FADDS: return "X86ISD::FADDS"; - case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; - case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; - case X86ISD::FSUBS: return "X86ISD::FSUBS"; - case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; - case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; - case X86ISD::FMULS: return "X86ISD::FMULS"; - case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; - case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; - case X86ISD::FDIVS: return "X86ISD::FDIVS"; - case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; - case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; - case X86ISD::FSQRTS: return "X86ISD::FSQRTS"; - case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; - case X86ISD::FGETEXP: return "X86ISD::FGETEXP"; - case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE"; - case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS"; - case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE"; - case X86ISD::SCALEF: return "X86ISD::SCALEF"; - case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND"; - case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; - case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND"; - case X86ISD::AVG: return "X86ISD::AVG"; - case X86ISD::MULHRS: return "X86ISD::MULHRS"; - case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; - case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; - case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; - case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; - case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI"; - case X86ISD::STRICT_CVTTP2UI: return 
"X86ISD::STRICT_CVTTP2UI"; - case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; - case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; - case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE"; - case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE"; - case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; - case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; - case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE"; - case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE"; - case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; - case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; - case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P"; - case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P"; - case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P"; - case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P"; - case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; - case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; - case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; - case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP"; - case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; - case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP"; - case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; - case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; - case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; - case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; - case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE"; - case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; - case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; - case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; - case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI"; - case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; - case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; - case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI"; - case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; - case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; - case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; - case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16"; - case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16"; - case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16"; - case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS"; - case X86ISD::LWPINS: return "X86ISD::LWPINS"; - case X86ISD::MGATHER: return "X86ISD::MGATHER"; - case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; - case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; - case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; - case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; - case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; - case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; - case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; - case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; - case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; - case X86ISD::NT_CALL: return "X86ISD::NT_CALL"; - case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; - case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; - case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; - case X86ISD::ENQCMD: return "X86ISD:ENQCMD"; - case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS"; - case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT"; +#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE; + NODE_NAME_CASE(BSF) + NODE_NAME_CASE(BSR) + NODE_NAME_CASE(FSHL) + NODE_NAME_CASE(FSHR) + NODE_NAME_CASE(FAND) + NODE_NAME_CASE(FANDN) + NODE_NAME_CASE(FOR) + NODE_NAME_CASE(FXOR) + NODE_NAME_CASE(FILD) + NODE_NAME_CASE(FIST) + 
NODE_NAME_CASE(FP_TO_INT_IN_MEM) + NODE_NAME_CASE(FLD) + NODE_NAME_CASE(FST) + NODE_NAME_CASE(CALL) + NODE_NAME_CASE(BT) + NODE_NAME_CASE(CMP) + NODE_NAME_CASE(FCMP) + NODE_NAME_CASE(STRICT_FCMP) + NODE_NAME_CASE(STRICT_FCMPS) + NODE_NAME_CASE(COMI) + NODE_NAME_CASE(UCOMI) + NODE_NAME_CASE(CMPM) + NODE_NAME_CASE(STRICT_CMPM) + NODE_NAME_CASE(CMPM_SAE) + NODE_NAME_CASE(SETCC) + NODE_NAME_CASE(SETCC_CARRY) + NODE_NAME_CASE(FSETCC) + NODE_NAME_CASE(FSETCCM) + NODE_NAME_CASE(FSETCCM_SAE) + NODE_NAME_CASE(CMOV) + NODE_NAME_CASE(BRCOND) + NODE_NAME_CASE(RET_FLAG) + NODE_NAME_CASE(IRET) + NODE_NAME_CASE(REP_STOS) + NODE_NAME_CASE(REP_MOVS) + NODE_NAME_CASE(GlobalBaseReg) + NODE_NAME_CASE(Wrapper) + NODE_NAME_CASE(WrapperRIP) + NODE_NAME_CASE(MOVQ2DQ) + NODE_NAME_CASE(MOVDQ2Q) + NODE_NAME_CASE(MMX_MOVD2W) + NODE_NAME_CASE(MMX_MOVW2D) + NODE_NAME_CASE(PEXTRB) + NODE_NAME_CASE(PEXTRW) + NODE_NAME_CASE(INSERTPS) + NODE_NAME_CASE(PINSRB) + NODE_NAME_CASE(PINSRW) + NODE_NAME_CASE(PSHUFB) + NODE_NAME_CASE(ANDNP) + NODE_NAME_CASE(BLENDI) + NODE_NAME_CASE(BLENDV) + NODE_NAME_CASE(HADD) + NODE_NAME_CASE(HSUB) + NODE_NAME_CASE(FHADD) + NODE_NAME_CASE(FHSUB) + NODE_NAME_CASE(CONFLICT) + NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(FMAXS) + NODE_NAME_CASE(FMAX_SAE) + NODE_NAME_CASE(FMAXS_SAE) + NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(FMINS) + NODE_NAME_CASE(FMIN_SAE) + NODE_NAME_CASE(FMINS_SAE) + NODE_NAME_CASE(FMAXC) + NODE_NAME_CASE(FMINC) + NODE_NAME_CASE(FRSQRT) + NODE_NAME_CASE(FRCP) + NODE_NAME_CASE(EXTRQI) + NODE_NAME_CASE(INSERTQI) + NODE_NAME_CASE(TLSADDR) + NODE_NAME_CASE(TLSBASEADDR) + NODE_NAME_CASE(TLSCALL) + NODE_NAME_CASE(EH_SJLJ_SETJMP) + NODE_NAME_CASE(EH_SJLJ_LONGJMP) + NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH) + NODE_NAME_CASE(EH_RETURN) + NODE_NAME_CASE(TC_RETURN) + NODE_NAME_CASE(FNSTCW16m) + NODE_NAME_CASE(LCMPXCHG_DAG) + NODE_NAME_CASE(LCMPXCHG8_DAG) + NODE_NAME_CASE(LCMPXCHG16_DAG) + NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG) + NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) + NODE_NAME_CASE(LADD) + NODE_NAME_CASE(LSUB) + NODE_NAME_CASE(LOR) + NODE_NAME_CASE(LXOR) + NODE_NAME_CASE(LAND) + NODE_NAME_CASE(VZEXT_MOVL) + NODE_NAME_CASE(VZEXT_LOAD) + NODE_NAME_CASE(VEXTRACT_STORE) + NODE_NAME_CASE(VTRUNC) + NODE_NAME_CASE(VTRUNCS) + NODE_NAME_CASE(VTRUNCUS) + NODE_NAME_CASE(VMTRUNC) + NODE_NAME_CASE(VMTRUNCS) + NODE_NAME_CASE(VMTRUNCUS) + NODE_NAME_CASE(VTRUNCSTORES) + NODE_NAME_CASE(VTRUNCSTOREUS) + NODE_NAME_CASE(VMTRUNCSTORES) + NODE_NAME_CASE(VMTRUNCSTOREUS) + NODE_NAME_CASE(VFPEXT) + NODE_NAME_CASE(STRICT_VFPEXT) + NODE_NAME_CASE(VFPEXT_SAE) + NODE_NAME_CASE(VFPEXTS) + NODE_NAME_CASE(VFPEXTS_SAE) + NODE_NAME_CASE(VFPROUND) + NODE_NAME_CASE(STRICT_VFPROUND) + NODE_NAME_CASE(VMFPROUND) + NODE_NAME_CASE(VFPROUND_RND) + NODE_NAME_CASE(VFPROUNDS) + NODE_NAME_CASE(VFPROUNDS_RND) + NODE_NAME_CASE(VSHLDQ) + NODE_NAME_CASE(VSRLDQ) + NODE_NAME_CASE(VSHL) + NODE_NAME_CASE(VSRL) + NODE_NAME_CASE(VSRA) + NODE_NAME_CASE(VSHLI) + NODE_NAME_CASE(VSRLI) + NODE_NAME_CASE(VSRAI) + NODE_NAME_CASE(VSHLV) + NODE_NAME_CASE(VSRLV) + NODE_NAME_CASE(VSRAV) + NODE_NAME_CASE(VROTLI) + NODE_NAME_CASE(VROTRI) + NODE_NAME_CASE(VPPERM) + NODE_NAME_CASE(CMPP) + NODE_NAME_CASE(STRICT_CMPP) + NODE_NAME_CASE(PCMPEQ) + NODE_NAME_CASE(PCMPGT) + NODE_NAME_CASE(PHMINPOS) + NODE_NAME_CASE(ADD) + NODE_NAME_CASE(SUB) + NODE_NAME_CASE(ADC) + NODE_NAME_CASE(SBB) + NODE_NAME_CASE(SMUL) + NODE_NAME_CASE(UMUL) + NODE_NAME_CASE(OR) + NODE_NAME_CASE(XOR) + NODE_NAME_CASE(AND) + NODE_NAME_CASE(BEXTR) + NODE_NAME_CASE(BZHI) + NODE_NAME_CASE(PDEP) + 
NODE_NAME_CASE(PEXT) + NODE_NAME_CASE(MUL_IMM) + NODE_NAME_CASE(MOVMSK) + NODE_NAME_CASE(PTEST) + NODE_NAME_CASE(TESTP) + NODE_NAME_CASE(KORTEST) + NODE_NAME_CASE(KTEST) + NODE_NAME_CASE(KADD) + NODE_NAME_CASE(KSHIFTL) + NODE_NAME_CASE(KSHIFTR) + NODE_NAME_CASE(PACKSS) + NODE_NAME_CASE(PACKUS) + NODE_NAME_CASE(PALIGNR) + NODE_NAME_CASE(VALIGN) + NODE_NAME_CASE(VSHLD) + NODE_NAME_CASE(VSHRD) + NODE_NAME_CASE(VSHLDV) + NODE_NAME_CASE(VSHRDV) + NODE_NAME_CASE(PSHUFD) + NODE_NAME_CASE(PSHUFHW) + NODE_NAME_CASE(PSHUFLW) + NODE_NAME_CASE(SHUFP) + NODE_NAME_CASE(SHUF128) + NODE_NAME_CASE(MOVLHPS) + NODE_NAME_CASE(MOVHLPS) + NODE_NAME_CASE(MOVDDUP) + NODE_NAME_CASE(MOVSHDUP) + NODE_NAME_CASE(MOVSLDUP) + NODE_NAME_CASE(MOVSD) + NODE_NAME_CASE(MOVSS) + NODE_NAME_CASE(UNPCKL) + NODE_NAME_CASE(UNPCKH) + NODE_NAME_CASE(VBROADCAST) + NODE_NAME_CASE(VBROADCAST_LOAD) + NODE_NAME_CASE(VBROADCASTM) + NODE_NAME_CASE(SUBV_BROADCAST) + NODE_NAME_CASE(VPERMILPV) + NODE_NAME_CASE(VPERMILPI) + NODE_NAME_CASE(VPERM2X128) + NODE_NAME_CASE(VPERMV) + NODE_NAME_CASE(VPERMV3) + NODE_NAME_CASE(VPERMI) + NODE_NAME_CASE(VPTERNLOG) + NODE_NAME_CASE(VFIXUPIMM) + NODE_NAME_CASE(VFIXUPIMM_SAE) + NODE_NAME_CASE(VFIXUPIMMS) + NODE_NAME_CASE(VFIXUPIMMS_SAE) + NODE_NAME_CASE(VRANGE) + NODE_NAME_CASE(VRANGE_SAE) + NODE_NAME_CASE(VRANGES) + NODE_NAME_CASE(VRANGES_SAE) + NODE_NAME_CASE(PMULUDQ) + NODE_NAME_CASE(PMULDQ) + NODE_NAME_CASE(PSADBW) + NODE_NAME_CASE(DBPSADBW) + NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) + NODE_NAME_CASE(VAARG_64) + NODE_NAME_CASE(WIN_ALLOCA) + NODE_NAME_CASE(MEMBARRIER) + NODE_NAME_CASE(MFENCE) + NODE_NAME_CASE(SEG_ALLOCA) + NODE_NAME_CASE(PROBED_ALLOCA) + NODE_NAME_CASE(RDRAND) + NODE_NAME_CASE(RDSEED) + NODE_NAME_CASE(RDPKRU) + NODE_NAME_CASE(WRPKRU) + NODE_NAME_CASE(VPMADDUBSW) + NODE_NAME_CASE(VPMADDWD) + NODE_NAME_CASE(VPSHA) + NODE_NAME_CASE(VPSHL) + NODE_NAME_CASE(VPCOM) + NODE_NAME_CASE(VPCOMU) + NODE_NAME_CASE(VPERMIL2) + NODE_NAME_CASE(FMSUB) + NODE_NAME_CASE(STRICT_FMSUB) + NODE_NAME_CASE(FNMADD) + NODE_NAME_CASE(STRICT_FNMADD) + NODE_NAME_CASE(FNMSUB) + NODE_NAME_CASE(STRICT_FNMSUB) + NODE_NAME_CASE(FMADDSUB) + NODE_NAME_CASE(FMSUBADD) + NODE_NAME_CASE(FMADD_RND) + NODE_NAME_CASE(FNMADD_RND) + NODE_NAME_CASE(FMSUB_RND) + NODE_NAME_CASE(FNMSUB_RND) + NODE_NAME_CASE(FMADDSUB_RND) + NODE_NAME_CASE(FMSUBADD_RND) + NODE_NAME_CASE(VPMADD52H) + NODE_NAME_CASE(VPMADD52L) + NODE_NAME_CASE(VRNDSCALE) + NODE_NAME_CASE(STRICT_VRNDSCALE) + NODE_NAME_CASE(VRNDSCALE_SAE) + NODE_NAME_CASE(VRNDSCALES) + NODE_NAME_CASE(VRNDSCALES_SAE) + NODE_NAME_CASE(VREDUCE) + NODE_NAME_CASE(VREDUCE_SAE) + NODE_NAME_CASE(VREDUCES) + NODE_NAME_CASE(VREDUCES_SAE) + NODE_NAME_CASE(VGETMANT) + NODE_NAME_CASE(VGETMANT_SAE) + NODE_NAME_CASE(VGETMANTS) + NODE_NAME_CASE(VGETMANTS_SAE) + NODE_NAME_CASE(PCMPESTR) + NODE_NAME_CASE(PCMPISTR) + NODE_NAME_CASE(XTEST) + NODE_NAME_CASE(COMPRESS) + NODE_NAME_CASE(EXPAND) + NODE_NAME_CASE(SELECTS) + NODE_NAME_CASE(ADDSUB) + NODE_NAME_CASE(RCP14) + NODE_NAME_CASE(RCP14S) + NODE_NAME_CASE(RCP28) + NODE_NAME_CASE(RCP28_SAE) + NODE_NAME_CASE(RCP28S) + NODE_NAME_CASE(RCP28S_SAE) + NODE_NAME_CASE(EXP2) + NODE_NAME_CASE(EXP2_SAE) + NODE_NAME_CASE(RSQRT14) + NODE_NAME_CASE(RSQRT14S) + NODE_NAME_CASE(RSQRT28) + NODE_NAME_CASE(RSQRT28_SAE) + NODE_NAME_CASE(RSQRT28S) + NODE_NAME_CASE(RSQRT28S_SAE) + NODE_NAME_CASE(FADD_RND) + NODE_NAME_CASE(FADDS) + NODE_NAME_CASE(FADDS_RND) + NODE_NAME_CASE(FSUB_RND) + NODE_NAME_CASE(FSUBS) + NODE_NAME_CASE(FSUBS_RND) + NODE_NAME_CASE(FMUL_RND) + NODE_NAME_CASE(FMULS) + 
NODE_NAME_CASE(FMULS_RND) + NODE_NAME_CASE(FDIV_RND) + NODE_NAME_CASE(FDIVS) + NODE_NAME_CASE(FDIVS_RND) + NODE_NAME_CASE(FSQRT_RND) + NODE_NAME_CASE(FSQRTS) + NODE_NAME_CASE(FSQRTS_RND) + NODE_NAME_CASE(FGETEXP) + NODE_NAME_CASE(FGETEXP_SAE) + NODE_NAME_CASE(FGETEXPS) + NODE_NAME_CASE(FGETEXPS_SAE) + NODE_NAME_CASE(SCALEF) + NODE_NAME_CASE(SCALEF_RND) + NODE_NAME_CASE(SCALEFS) + NODE_NAME_CASE(SCALEFS_RND) + NODE_NAME_CASE(AVG) + NODE_NAME_CASE(MULHRS) + NODE_NAME_CASE(SINT_TO_FP_RND) + NODE_NAME_CASE(UINT_TO_FP_RND) + NODE_NAME_CASE(CVTTP2SI) + NODE_NAME_CASE(CVTTP2UI) + NODE_NAME_CASE(STRICT_CVTTP2SI) + NODE_NAME_CASE(STRICT_CVTTP2UI) + NODE_NAME_CASE(MCVTTP2SI) + NODE_NAME_CASE(MCVTTP2UI) + NODE_NAME_CASE(CVTTP2SI_SAE) + NODE_NAME_CASE(CVTTP2UI_SAE) + NODE_NAME_CASE(CVTTS2SI) + NODE_NAME_CASE(CVTTS2UI) + NODE_NAME_CASE(CVTTS2SI_SAE) + NODE_NAME_CASE(CVTTS2UI_SAE) + NODE_NAME_CASE(CVTSI2P) + NODE_NAME_CASE(CVTUI2P) + NODE_NAME_CASE(STRICT_CVTSI2P) + NODE_NAME_CASE(STRICT_CVTUI2P) + NODE_NAME_CASE(MCVTSI2P) + NODE_NAME_CASE(MCVTUI2P) + NODE_NAME_CASE(VFPCLASS) + NODE_NAME_CASE(VFPCLASSS) + NODE_NAME_CASE(MULTISHIFT) + NODE_NAME_CASE(SCALAR_SINT_TO_FP) + NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND) + NODE_NAME_CASE(SCALAR_UINT_TO_FP) + NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND) + NODE_NAME_CASE(CVTPS2PH) + NODE_NAME_CASE(STRICT_CVTPS2PH) + NODE_NAME_CASE(MCVTPS2PH) + NODE_NAME_CASE(CVTPH2PS) + NODE_NAME_CASE(STRICT_CVTPH2PS) + NODE_NAME_CASE(CVTPH2PS_SAE) + NODE_NAME_CASE(CVTP2SI) + NODE_NAME_CASE(CVTP2UI) + NODE_NAME_CASE(MCVTP2SI) + NODE_NAME_CASE(MCVTP2UI) + NODE_NAME_CASE(CVTP2SI_RND) + NODE_NAME_CASE(CVTP2UI_RND) + NODE_NAME_CASE(CVTS2SI) + NODE_NAME_CASE(CVTS2UI) + NODE_NAME_CASE(CVTS2SI_RND) + NODE_NAME_CASE(CVTS2UI_RND) + NODE_NAME_CASE(CVTNE2PS2BF16) + NODE_NAME_CASE(CVTNEPS2BF16) + NODE_NAME_CASE(MCVTNEPS2BF16) + NODE_NAME_CASE(DPBF16PS) + NODE_NAME_CASE(LWPINS) + NODE_NAME_CASE(MGATHER) + NODE_NAME_CASE(MSCATTER) + NODE_NAME_CASE(VPDPBUSD) + NODE_NAME_CASE(VPDPBUSDS) + NODE_NAME_CASE(VPDPWSSD) + NODE_NAME_CASE(VPDPWSSDS) + NODE_NAME_CASE(VPSHUFBITQMB) + NODE_NAME_CASE(GF2P8MULB) + NODE_NAME_CASE(GF2P8AFFINEQB) + NODE_NAME_CASE(GF2P8AFFINEINVQB) + NODE_NAME_CASE(NT_CALL) + NODE_NAME_CASE(NT_BRIND) + NODE_NAME_CASE(UMWAIT) + NODE_NAME_CASE(TPAUSE) + NODE_NAME_CASE(ENQCMD) + NODE_NAME_CASE(ENQCMDS) + NODE_NAME_CASE(VP2INTERSECT) } return nullptr; +#undef NODE_NAME_CASE } /// Return true if the addressing mode represented by AM is legal for this @@ -30021,7 +30716,8 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { return false; // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. - if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 && + // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. 
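getTargetNodeName above is rewritten from hand-maintained "case X: return "X86ISD::X";" lines to a stringizing NODE_NAME_CASE macro, so the returned string can no longer drift from the enumerator spelling (the deleted table even returned "X86ISD:ENQCMD" with a single colon). A minimal standalone illustration of the pattern, with an invented enum and macro name:

#include <cstdio>

enum class Op { Add, Sub, Mul };

// Illustrative only: the same stringizing-macro trick as NODE_NAME_CASE.
static const char *opName(Op O) {
  switch (O) {
#define OP_NAME_CASE(NODE) case Op::NODE: return "Op::" #NODE;
  OP_NAME_CASE(Add)
  OP_NAME_CASE(Sub)
  OP_NAME_CASE(Mul)
#undef OP_NAME_CASE
  }
  return nullptr;
}

int main() { std::puts(opName(Op::Mul)); }   // prints "Op::Mul"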
+ if (Subtarget.hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) return false; @@ -30107,7 +30803,7 @@ bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const { } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) + if (!VT1.isScalarInteger() || !VT2.isScalarInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); @@ -30148,6 +30844,39 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { return false; } +bool X86TargetLowering::shouldSinkOperands(Instruction *I, + SmallVectorImpl<Use *> &Ops) const { + // A uniform shift amount in a vector shift or funnel shift may be much + // cheaper than a generic variable vector shift, so make that pattern visible + // to SDAG by sinking the shuffle instruction next to the shift. + int ShiftAmountOpNum = -1; + if (I->isShift()) + ShiftAmountOpNum = 1; + else if (auto *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::fshl || + II->getIntrinsicID() == Intrinsic::fshr) + ShiftAmountOpNum = 2; + } + + if (ShiftAmountOpNum == -1) + return false; + + auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum)); + if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 && + isVectorShiftByScalarCheap(I->getType())) { + Ops.push_back(&I->getOperandUse(ShiftAmountOpNum)); + return true; + } + + return false; +} + +bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const { + if (!Subtarget.is64Bit()) + return false; + return TargetLowering::shouldConvertPhiType(From, To); +} + bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0))) return false; @@ -30191,7 +30920,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. -bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { +bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const { if (!VT.isSimple()) return false; @@ -30336,7 +31065,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineOperand &Segment = MI.getOperand(5); unsigned ArgSize = MI.getOperand(6).getImm(); unsigned ArgMode = MI.getOperand(7).getImm(); - unsigned Align = MI.getOperand(8).getImm(); + Align Alignment = Align(MI.getOperand(8).getImm()); MachineFunction *MF = MBB->getParent(); @@ -30376,7 +31105,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; - bool NeedsAlign = (Align > 8); + bool NeedsAlign = (Alignment > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; @@ -30524,17 +31253,16 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, // to OverflowDestReg. 
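EmitVAARG64WithCustomInserter here switches from a raw unsigned alignment immediate to the Align type; the ADD/AND pair emitted just below rounds the overflow-area address up with the usual power-of-two formula. A small sketch of that formula (the helper name is illustrative):

#include <cstdint>
#include <cassert>

// Illustrative only: aligned_addr = (addr + (align - 1)) & ~(align - 1),
// the round-up used for the va_arg overflow area; align must be a power of two.
static uint64_t alignUp(uint64_t addr, uint64_t align) {
  assert(align != 0 && (align & (align - 1)) == 0 && "power of two expected");
  return (addr + (align - 1)) & ~(align - 1);
}
// alignUp(0x1001, 16) == 0x1010, alignUp(0x1000, 16) == 0x1000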
if (NeedsAlign) { // Align the overflow address - assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); Register TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) - .addReg(OverflowAddrReg) - .addImm(Align-1); + .addReg(OverflowAddrReg) + .addImm(Alignment.value() - 1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) - .addReg(TmpReg) - .addImm(~(uint64_t)(Align-1)); + .addReg(TmpReg) + .addImm(~(uint64_t)(Alignment.value() - 1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); @@ -30630,7 +31358,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, - /*Size=*/16, /*Align=*/16); + /*Size=*/16, Align(16)); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) @@ -30697,11 +31425,13 @@ static bool isCMOVPseudo(MachineInstr &MI) { case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: + case X86::CMOV_VR64: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: + case X86::CMOV_VK1: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: @@ -30998,8 +31728,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; - ++NextMIIt; - NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end()); + NextMIIt = next_nodbg(NextMIIt, ThisMBB->end()); } } @@ -31071,6 +31800,112 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, return SinkMBB; } +static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { + if (IsLP64) { + if (isInt<8>(Imm)) + return X86::SUB64ri8; + return X86::SUB64ri32; + } else { + if (isInt<8>(Imm)) + return X86::SUB32ri8; + return X86::SUB32ri; + } +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); + DebugLoc DL = MI.getDebugLoc(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + + const unsigned ProbeSize = getStackProbeSize(*MF); + + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = ++MBB->getIterator(); + MF->insert(MBBIter, testMBB); + MF->insert(MBBIter, blockMBB); + MF->insert(MBBIter, tailMBB); + + Register sizeVReg = MI.getOperand(1).getReg(); + + Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP; + + Register TmpStackPtr = MRI.createVirtualRegister( + TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); + Register FinalStackPtr = MRI.createVirtualRegister( + TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); + + BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr) + .addReg(physSPReg); + { + const unsigned Opc = TFI.Uses64BitFramePtr ? 
X86::SUB64rr : X86::SUB32rr; + BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr) + .addReg(TmpStackPtr) + .addReg(sizeVReg); + } + + // test rsp size + + BuildMI(testMBB, DL, + TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) + .addReg(FinalStackPtr) + .addReg(physSPReg); + + BuildMI(testMBB, DL, TII->get(X86::JCC_1)) + .addMBB(tailMBB) + .addImm(X86::COND_L); + testMBB->addSuccessor(blockMBB); + testMBB->addSuccessor(tailMBB); + + // Touch the block then extend it. This is done on the opposite side of + // static probe where we allocate then touch, to avoid the need of probing the + // tail of the static alloca. Possible scenarios are: + // + // + ---- <- ------------ <- ------------- <- ------------ + + // | | + // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] + + // | | + // + <- ----------- <- ------------ <- ----------- <- ------------ + + // + // The property we want to enforce is to never have more than [page alloc] between two probes. + + const unsigned MovMIOpc = + TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi; + addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0) + .addImm(0); + + BuildMI(blockMBB, DL, + TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg) + .addReg(physSPReg) + .addImm(ProbeSize); + + + BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB); + blockMBB->addSuccessor(testMBB); + + // Replace original instruction by the expected stack ptr + BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) + .addReg(FinalStackPtr); + + tailMBB->splice(tailMBB->end(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + tailMBB->transferSuccessorsAndUpdatePHIs(MBB); + MBB->addSuccessor(testMBB); + + // Delete the original pseudo instruction. + MI.eraseFromParent(); + + // And we're done. + return tailMBB; +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -31231,29 +32066,16 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, BB->addSuccessor(RestoreMBB); MI.getOperand(0).setMBB(RestoreMBB); + // Marking this as an EH pad but not a funclet entry block causes PEI to + // restore stack pointers in the block. + RestoreMBB->setIsEHPad(true); + auto RestoreMBBI = RestoreMBB->begin(); - BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * -X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, - MachineBasicBlock *BB) const { - MachineFunction *MF = BB->getParent(); - const Constant *PerFn = MF->getFunction().getPersonalityFn(); - bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); - // Only 32-bit SEH requires special handling for catchpad. - if (IsSEH && Subtarget.is32Bit()) { - const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); - BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); - } - MI.eraseFromParent(); - return BB; -} - -MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: @@ -31755,12 +32577,17 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. - Register ZReg = MRI.createVirtualRegister(PtrRC); - unsigned XorRROpc = (PVT == MVT::i64) ? 
X86::XOR64rr : X86::XOR32rr; - BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) - .addDef(ZReg) - .addReg(ZReg, RegState::Undef) - .addReg(ZReg, RegState::Undef); + Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass); + BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg); + + if (PVT == MVT::i64) { + Register TmpZReg = MRI.createVirtualRegister(PtrRC); + BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg) + .addImm(0) + .addReg(ZReg) + .addImm(X86::sub_32bit); + ZReg = TmpZReg; + } // Read the current SSP Register value to the zeroed register. Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); @@ -31889,7 +32716,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, Register Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; Register SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; @@ -32236,6 +33063,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + auto TMMImmToTMMReg = [](unsigned Imm) { + assert (Imm < 8 && "Illegal tmm index"); + return X86::TMM0 + Imm; + }; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: @@ -32250,11 +33081,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return EmitLoweredIndirectThunk(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); - case X86::CATCHPAD: - return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); + case X86::PROBED_ALLOCA_32: + case X86::PROBED_ALLOCA_64: + return EmitLoweredProbedAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -32268,11 +33100,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: + case X86::CMOV_VR64: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: + case X86::CMOV_VK1: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: @@ -32327,7 +33161,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. - int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int OrigCWFrameIdx = + MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); @@ -32348,7 +33183,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(NewCW, RegState::Kill, X86::sub_16bit); // Prepare memory for FLDCW. 
- int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); + int NewCWFrameIdx = + MF->getFrameInfo().CreateStackObject(2, Align(2), false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), NewCWFrameIdx) .addReg(NewCW16, RegState::Kill); @@ -32483,6 +33319,97 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, BB->addLiveIn(BasePtr); return BB; } + case TargetOpcode::PREALLOCATED_SETUP: { + assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); + auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setHasPreallocatedCall(true); + int64_t PreallocatedId = MI.getOperand(0).getImm(); + size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId); + assert(StackAdjustment != 0 && "0 stack adjustment"); + LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " + << StackAdjustment << "\n"); + BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(StackAdjustment); + MI.eraseFromParent(); + return BB; + } + case TargetOpcode::PREALLOCATED_ARG: { + assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); + int64_t PreallocatedId = MI.getOperand(1).getImm(); + int64_t ArgIdx = MI.getOperand(2).getImm(); + auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; + LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx + << ", arg offset " << ArgOffset << "\n"); + // stack pointer + offset + addRegOffset( + BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), + X86::ESP, false, ArgOffset); + MI.eraseFromParent(); + return BB; + } + case X86::PTDPBSSD: + case X86::PTDPBSUD: + case X86::PTDPBUSD: + case X86::PTDPBUUD: + case X86::PTDPBF16PS: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; + case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; + case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; + case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; + case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); + MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } + case X86::PTILEZERO: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Imm = MI.getOperand(0).getImm(); + BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); + MI.eraseFromParent(); // The pseudo is gone now. 
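The FP80_TO_INT64_IN_MEM handling above saves the x87 control word with FNSTCW, forces round-toward-zero, converts with FIST, and reloads the old word, because FIST honors the current rounding mode while the conversion being lowered must truncate. A rough portable illustration of the same effect via <cfenv> (assumes the process starts in round-to-nearest and that the compiler honors the rounding-mode change):

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  double v = 2.7;
  std::printf("%ld\n", std::lrint(v));   // 3: lrint follows round-to-nearest
  std::fesetround(FE_TOWARDZERO);        // analogue of loading a modified FCW
  std::printf("%ld\n", std::lrint(v));   // 2: now it truncates, like FIST after FLDCW
  std::fesetround(FE_TONEAREST);         // restore, like the trailing FLDCW
  return 0;
}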
+ return BB; + } + case X86::PTILELOADD: + case X86::PTILELOADDT1: + case X86::PTILESTORED: { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Opc; + switch (MI.getOpcode()) { + case X86::PTILELOADD: Opc = X86::TILELOADD; break; + case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break; + case X86::PTILESTORED: Opc = X86::TILESTORED; break; + } + + MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); + unsigned CurOp = 0; + if (Opc != X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Define); + + MIB.add(MI.getOperand(CurOp++)); // base + MIB.add(MI.getOperand(CurOp++)); // scale + MIB.add(MI.getOperand(CurOp++)); // index -- stride + MIB.add(MI.getOperand(CurOp++)); // displacement + MIB.add(MI.getOperand(CurOp++)); // segment + + if (Opc == X86::TILESTORED) + MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), + RegState::Undef); + + MI.eraseFromParent(); // The pseudo is gone now. + return BB; + } } } @@ -32492,20 +33419,53 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, bool X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, - const APInt &Demanded, + const APInt &DemandedBits, + const APInt &DemandedElts, TargetLoweringOpt &TLO) const { - // Only optimize Ands to prevent shrinking a constant that could be - // matched by movzx. - if (Op.getOpcode() != ISD::AND) - return false; - EVT VT = Op.getValueType(); + unsigned Opcode = Op.getOpcode(); + unsigned EltSize = VT.getScalarSizeInBits(); - // Ignore vectors. - if (VT.isVector()) + if (VT.isVector()) { + // If the constant is only all signbits in the active bits, then we should + // extend it to the entire constant to allow it act as a boolean constant + // vector. + auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) { + if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode())) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) { + if (!DemandedElts[i] || V.getOperand(i).isUndef()) + continue; + const APInt &Val = V.getConstantOperandAPInt(i); + if (Val.getBitWidth() > Val.getNumSignBits() && + Val.trunc(ActiveBits).getNumSignBits() == ActiveBits) + return true; + } + return false; + }; + // For vectors - if we have a constant, then try to sign extend. + // TODO: Handle AND/ANDN cases. + unsigned ActiveBits = DemandedBits.getActiveBits(); + if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) && + (Opcode == ISD::OR || Opcode == ISD::XOR) && + NeedsSignExtension(Op.getOperand(1), ActiveBits)) { + EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits); + EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT, + VT.getVectorNumElements()); + SDValue NewC = + TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT, + Op.getOperand(1), TLO.DAG.getValueType(ExtVT)); + SDValue NewOp = + TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC); + return TLO.CombineTo(Op, NewOp); + } return false; + } - unsigned Size = VT.getSizeInBits(); + // Only optimize Ands to prevent shrinking a constant that could be + // matched by movzx. + if (Opcode != ISD::AND) + return false; // Make sure the RHS really is a constant. ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); @@ -32515,7 +33475,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, const APInt &Mask = C->getAPIntValue(); // Clear all non-demanded bits initially. - APInt ShrunkMask = Mask & Demanded; + APInt ShrunkMask = Mask & DemandedBits; // Find the width of the shrunk mask. 
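targetShrinkDemandedConstant gains DemandedElts and a vector sign-extension path here; the scalar AND path, continuing just below, still tries to widen the mask to a byte-sized zero-extend mask that movzx can match. A plain sketch of that scalar decision, with illustrative helper names and without the keep-the-mask-unchanged early out:

#include <cstdint>

// Illustrative only: can an AND mask be replaced by a byte-sized
// zero-extend mask, given which result bits are actually demanded?
static unsigned activeBits(uint64_t v) {
  unsigned n = 0;
  while (v) { v >>= 1; ++n; }
  return n;
}

static unsigned powerOf2Ceil(unsigned v) {
  unsigned p = 1;
  while (p < v) p <<= 1;
  return p;
}

static bool shrinkAndMask(uint64_t mask, uint64_t demanded, unsigned sizeBits,
                          uint64_t &newMask) {
  uint64_t shrunk = mask & demanded;          // clear non-demanded bits
  unsigned width = activeBits(shrunk);
  if (width == 0)
    return false;                             // all-zero mask: handled elsewhere
  width = powerOf2Ceil(width < 8 ? 8 : width);
  if (width > sizeBits)
    width = sizeBits;
  uint64_t zext = (width >= 64) ? ~0ull : ((1ull << width) - 1);
  // The new mask may only differ from the old one in non-demanded bits.
  if ((zext & ~(mask | ~demanded)) != 0)
    return false;
  newMask = zext;
  return true;
}
// mask 0xF0 with the low byte demanded yields newMask 0xFF (a movzx-able mask).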
unsigned Width = ShrunkMask.getActiveBits(); @@ -32527,10 +33487,10 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, // Find the next power of 2 width, rounding up to a byte. Width = PowerOf2Ceil(std::max(Width, 8U)); // Truncate the width to size to handle illegal types. - Width = std::min(Width, Size); + Width = std::min(Width, EltSize); // Calculate a possible zero extend mask for this constant. - APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width); + APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width); // If we aren't changing the mask, just return true to keep it and prevent // the caller from optimizing. @@ -32539,7 +33499,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, // Make sure the new mask can be represented by a combination of mask bits // and non-demanded bits. - if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded)) + if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits)) return false; // Replace the constant with the zero extend mask. @@ -32555,6 +33515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); + unsigned NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || @@ -32582,7 +33543,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); - Known = Known.zextOrTrunc(BitWidth, false); + Known = Known.anyextOrTrunc(BitWidth); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } @@ -32652,10 +33613,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); - // Output known-0 bits are only known if clear in both the LHS & RHS. - Known.Zero &= Known2.Zero; - // Output known-1 are known to be set if set in either the LHS | RHS. - Known.One |= Known2.One; + Known |= Known2; break; } case X86ISD::PSADBW: { @@ -32679,6 +33637,76 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero &= Known2.Zero; break; } + case X86ISD::BEXTR: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) { + unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); + unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); + + // If the length is 0, the result is 0. + if (Length == 0) { + Known.setAllZero(); + break; + } + + if ((Shift + Length) <= BitWidth) { + Known = DAG.computeKnownBits(Op0, Depth + 1); + Known = Known.extractBits(Length, Shift); + Known = Known.zextOrTrunc(BitWidth); + } + } + break; + } + case X86ISD::CVTSI2P: + case X86ISD::CVTUI2P: + case X86ISD::CVTP2SI: + case X86ISD::CVTP2UI: + case X86ISD::MCVTP2SI: + case X86ISD::MCVTP2UI: + case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: + case X86ISD::MCVTTP2SI: + case X86ISD::MCVTTP2UI: + case X86ISD::MCVTSI2P: + case X86ISD::MCVTUI2P: + case X86ISD::VFPROUND: + case X86ISD::VMFPROUND: + case X86ISD::CVTPS2PH: + case X86ISD::MCVTPS2PH: { + // Conversions - upper elements are known zero. 
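The new BEXTR known-bits case above decodes the control operand (start in bits [7:0], length in bits [15:8]) and extracts that field. A scalar reference model, illustrative rather than a formal definition of the instruction:

#include <cstdint>

// Illustrative only: scalar model of BEXTR matching the known-bits case above.
static uint64_t bextr64(uint64_t src, uint32_t control) {
  uint32_t start = control & 0xFFu;          // bit position to start from
  uint32_t len   = (control >> 8) & 0xFFu;   // number of bits to extract
  if (len == 0 || start >= 64)
    return 0;                                // zero-length or out-of-range field
  uint64_t shifted = src >> start;
  return (len >= 64) ? shifted : (shifted & ((1ull << len) - 1));
}
// bextr64(0x12345678, 0x0804) == 0x67  (8 bits starting at bit 4)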
+ EVT SrcVT = Op.getOperand(0).getValueType(); + if (SrcVT.isVector()) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + if (NumElts > NumSrcElts && + DemandedElts.countTrailingZeros() >= NumSrcElts) + Known.setAllZero(); + } + break; + } + case X86ISD::STRICT_CVTTP2SI: + case X86ISD::STRICT_CVTTP2UI: + case X86ISD::STRICT_CVTSI2P: + case X86ISD::STRICT_CVTUI2P: + case X86ISD::STRICT_VFPROUND: + case X86ISD::STRICT_CVTPS2PH: { + // Strict Conversions - upper elements are known zero. + EVT SrcVT = Op.getOperand(1).getValueType(); + if (SrcVT.isVector()) { + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + if (NumElts > NumSrcElts && + DemandedElts.countTrailingZeros() >= NumSrcElts) + Known.setAllZero(); + } + break; + } + case X86ISD::MOVQ2DQ: { + // Move from MMX to XMM. Upper half of XMM should be 0. + if (DemandedElts.countTrailingZeros() >= (NumElts / 2)) + Known.setAllZero(); + break; + } } // Handle target shuffles. @@ -32745,11 +33773,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( return VTBits; case X86ISD::VTRUNC: { - // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); - unsigned NumSrcBits = Src.getScalarValueSizeInBits(); + MVT SrcVT = Src.getSimpleValueType(); + unsigned NumSrcBits = SrcVT.getScalarSizeInBits(); assert(VTBits < NumSrcBits && "Illegal truncation input type"); - unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); + unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1); if (Tmp > (NumSrcBits - VTBits)) return Tmp - (NumSrcBits - VTBits); return 1; @@ -32877,6 +33906,21 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const { return N; } +// Helper to look for a normal load that can be narrowed into a vzload with the +// specified VT and memory VT. Returns SDValue() on failure. +static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, + SelectionDAG &DAG) { + // Can't if the load is volatile or atomic. + if (!LN->isSimple()) + return SDValue(); + + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT, + LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); +} + // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. @@ -33021,9 +34065,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - - bool ContainsZeros = - llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + bool ContainsZeros = isAnyZero(Mask); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { @@ -33071,7 +34113,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, // Narrow the repeated mask to create 32-bit element permutes. SmallVector<int, 4> WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) - scaleShuffleMask<int>(2, RepeatedMask, WordMask); + narrowShuffleMaskElts(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? 
MVT::i32 : MVT::f32); @@ -33114,17 +34156,32 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, } // Attempt to match against byte/bit shifts. - // FIXME: Add 512-bit support. - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + if (AllowIntDomain && + ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); - if (0 < ShiftAmt) { + if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() || + 32 <= ShuffleVT.getScalarSizeInBits())) { PermuteImm = (unsigned)ShiftAmt; return true; } } + // Attempt to match against bit rotates. + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 && + ((MaskVT.is128BitVector() && Subtarget.hasXOP()) || + Subtarget.hasAVX512())) { + int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits, + Subtarget, Mask); + if (0 < RotateAmt) { + Shuffle = X86ISD::VROTLI; + PermuteImm = (unsigned)RotateAmt; + return true; + } + } + return false; } @@ -33205,9 +34262,29 @@ static bool matchBinaryPermuteShuffle( unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); + // Attempt to match against VALIGND/VALIGNQ rotate. + if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) && + ((MaskVT.is128BitVector() && Subtarget.hasVLX()) || + (MaskVT.is256BitVector() && Subtarget.hasVLX()) || + (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { + if (!isAnyZero(Mask)) { + int Rotation = matchShuffleAsElementRotate(V1, V2, Mask); + if (0 < Rotation) { + Shuffle = X86ISD::VALIGN; + if (EltSizeInBits == 64) + ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64); + else + ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32); + PermuteImm = Rotation; + return true; + } + } + } + // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || + (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; @@ -33257,8 +34334,7 @@ static bool matchBinaryPermuteShuffle( // Attempt to combine to INSERTPS, but only if it has elements that need to // be set to zero. 
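The shift matching extended above pairs a shuffle mask against a whole-vector byte/element shift. As a rough standalone illustration (a hypothetical helper, much simpler than matchShuffleAsShift, which also handles right shifts, per-lane repetition and the new bit rotates), a mask is a zero-filling left shift by Shift elements when the low Shift positions are zeroable and every other element comes from position i - Shift:

#include <cstdio>
#include <vector>

// Sentinels in the style of shuffle combining: -1 = undef, -2 = known zero.
constexpr int Undef = -1, Zero = -2;

// Returns the left-shift amount (in elements) if Mask moves every kept
// element up by the same amount and fills the vacated low positions with
// zeros/undefs; returns -1 if the mask is not such a shift.
int matchMaskAsLeftShift(const std::vector<int> &Mask) {
  int N = (int)Mask.size();
  for (int Shift = 1; Shift < N; ++Shift) {
    bool Matches = true;
    for (int i = 0; i < N && Matches; ++i) {
      if (i < Shift)
        Matches = Mask[i] == Zero || Mask[i] == Undef; // shifted-in zeros
      else
        Matches = Mask[i] == Undef || Mask[i] == i - Shift;
    }
    if (Matches)
      return Shift;
  }
  return -1;
}

int main() {
  // {Z, Z, 0, 1} on a 4-element vector is a shift left by two elements.
  std::printf("%d\n", matchMaskAsLeftShift({Zero, Zero, 0, 1})); // prints 2
  std::printf("%d\n", matchMaskAsLeftShift({0, 2, 1, 3}));       // prints -1
  return 0;
}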
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && - MaskVT.is128BitVector() && - llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) && + MaskVT.is128BitVector() && isAnyZero(Mask) && matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; @@ -33386,6 +34462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return DAG.getBitcast(RootVT, V1); } + bool OptForSize = DAG.shouldOptForSize(); unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; @@ -33396,11 +34473,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. - // TODO - this currently prevents all lane shuffles from occurring. - // TODO - check for writemasks usage instead of always preventing combining. - // TODO - attempt to narrow Mask back to writemask size. - bool IsEVEXShuffle = - RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); + bool IsMaskedShuffle = false; + if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) { + if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT && + Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) { + IsMaskedShuffle = true; + } + } + + // If we are shuffling a broadcast (and not introducing zeros) then + // we can just use the broadcast directly. This works for smaller broadcast + // elements as well as they already repeat across each mask element + if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) && + (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) { + return DAG.getBitcast(RootVT, V1); + } // Attempt to match a subvector broadcast. // shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0) @@ -33420,27 +34507,138 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } - // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. + // Handle 128/256-bit lane shuffles of 512-bit vectors. + if (RootVT.is512BitVector() && + (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) { + MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); + + // If the upper subvectors are zeroable, then an extract+insert is more + // optimal than using X86ISD::SHUF128. The insertion is free, even if it has + // to zero the upper subvectors. + if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) { + if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) + return SDValue(); // Nothing to do! + assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) && + "Unexpected lane shuffle"); + Res = DAG.getBitcast(ShuffleVT, V1); + unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts); + bool UseZero = isAnyZero(BaseMask); + Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits); + Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits); + return DAG.getBitcast(RootVT, Res); + } + + // Narrow shuffle mask to v4x128. + SmallVector<int, 4> Mask; + assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size"); + narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask); + + // Try to lower to vshuf64x2/vshuf32x4. 
+ auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask, + SDValue V1, SDValue V2, SelectionDAG &DAG) { + unsigned PermMask = 0; + // Insure elements came from the same Op. + SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; + for (int i = 0; i < 4; ++i) { + assert(Mask[i] >= -1 && "Illegal shuffle sentinel value"); + if (Mask[i] < 0) + continue; + + SDValue Op = Mask[i] >= 4 ? V2 : V1; + unsigned OpIndex = i / 2; + if (Ops[OpIndex].isUndef()) + Ops[OpIndex] = Op; + else if (Ops[OpIndex] != Op) + return SDValue(); + + // Convert the 128-bit shuffle mask selection values into 128-bit + // selection bits defined by a vshuf64x2 instruction's immediate control + // byte. + PermMask |= (Mask[i] % 4) << (i * 2); + } + + return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, + DAG.getBitcast(ShuffleVT, Ops[0]), + DAG.getBitcast(ShuffleVT, Ops[1]), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + }; + + // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask + // doesn't work because our mask is for 128 bits and we don't have an MVT + // to match that. + bool PreferPERMQ = + UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) && + isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) && + isUndefOrInRange(Mask[3], 2, 4) && + (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) && + (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2)); + + if (!isAnyZero(Mask) && !PreferPERMQ) { + if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG)) + return DAG.getBitcast(RootVT, V); + } + } // Handle 128-bit lane shuffles of 256-bit vectors. - // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless - // we need to use the zeroing feature. - // TODO - this should support binary shuffles. - if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && - !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && - !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { + if (RootVT.is256BitVector() && NumBaseMaskElts == 2) { + MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); + + // If the upper half is zeroable, then an extract+insert is more optimal + // than using X86ISD::VPERM2X128. The insertion is free, even if it has to + // zero the upper half. + if (isUndefOrZero(BaseMask[1])) { + if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) + return SDValue(); // Nothing to do! + assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle"); + Res = DAG.getBitcast(ShuffleVT, V1); + Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL); + Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG, + DL, 256); + return DAG.getBitcast(RootVT, Res); + } + if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! - MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); - unsigned PermMask = 0; - PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); - PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); - - Res = DAG.getBitcast(ShuffleVT, V1); - Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, - DAG.getUNDEF(ShuffleVT), - DAG.getTargetConstant(PermMask, DL, MVT::i8)); - return DAG.getBitcast(RootVT, Res); + + // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless + // we need to use the zeroing feature. + // Prefer blends for sequential shuffles unless we are optimizing for size. 
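A minimal sketch, assuming the two-bits-per-destination-lane encoding used by the MatchSHUF128 lambda above, of how a four-entry 128-bit lane mask becomes the vshuf64x2/vshuf32x4 immediate (buildShuf128Imm is an illustrative name, not an LLVM helper; the operand choice per lane is resolved separately, as in the lambda):

#include <array>
#include <cassert>

// Build the 8-bit SHUF128-style immediate from a 4-entry 128-bit lane mask.
// Each destination lane i contributes two immediate bits holding the selected
// source lane modulo 4. Undef entries (-1) may pick anything; 0 is used here.
unsigned buildShuf128Imm(const std::array<int, 4> &LaneMask) {
  unsigned Imm = 0;
  for (int i = 0; i < 4; ++i) {
    int M = LaneMask[i];
    if (M < 0)
      continue;                   // undef lane: leave its two bits as zero
    Imm |= unsigned(M % 4) << (i * 2);
  }
  return Imm;
}

int main() {
  // Identity lane order 0,1,2,3 encodes as 0b11100100 = 0xE4.
  assert(buildShuf128Imm({0, 1, 2, 3}) == 0xE4);
  // Swapping the two 256-bit halves (2,3,0,1) encodes as 0b01001110 = 0x4E.
  assert(buildShuf128Imm({2, 3, 0, 1}) == 0x4E);
  return 0;
}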
+ if (UnaryShuffle && + !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) && + (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) { + unsigned PermMask = 0; + PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); + PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); + + Res = DAG.getBitcast(ShuffleVT, V1); + Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, + DAG.getUNDEF(ShuffleVT), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + + if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) + return SDValue(); // Nothing to do! + + // TODO - handle AVX512VL cases with X86ISD::SHUF128. + if (!UnaryShuffle && !IsMaskedShuffle) { + assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) && + "Unexpected shuffle sentinel value"); + // Prefer blends to X86ISD::VPERM2X128. + if (!((BaseMask[0] == 0 && BaseMask[1] == 3) || + (BaseMask[0] == 2 && BaseMask[1] == 1))) { + unsigned PermMask = 0; + PermMask |= ((BaseMask[0] & 3) << 0); + PermMask |= ((BaseMask[1] & 3) << 4); + + Res = DAG.getNode( + X86ISD::VPERM2X128, DL, ShuffleVT, + DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2), + DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2), + DAG.getTargetConstant(PermMask, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + } } // For masks that have been widened to 128-bit elements or more, @@ -33449,9 +34647,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; - scaleShuffleMask<int>(MaskScale, BaseMask, Mask); + narrowShuffleMaskElts(MaskScale, BaseMask, Mask); } else { - Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end()); + Mask.assign(BaseMask.begin(), BaseMask.end()); + } + + // For masked shuffles, we're trying to match the root width for better + // writemask folding, attempt to scale the mask. + // TODO - variable shuffles might need this to be widened again. + if (IsMaskedShuffle && NumRootElts > Mask.size()) { + assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size"); + int MaskScale = NumRootElts / Mask.size(); + SmallVector<int, 64> ScaledMask; + narrowShuffleMaskElts(MaskScale, Mask, ScaledMask); + Mask = std::move(ScaledMask); } unsigned NumMaskElts = Mask.size(); @@ -33484,26 +34693,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, APInt Zeroable = KnownUndef | KnownZero; if (UnaryShuffle) { - // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load - // directly if we don't shuffle the lower element and we shuffle the upper - // (zero) elements within themselves. - if (V1.getOpcode() == X86ISD::VZEXT_LOAD && - (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() % - MaskEltSizeInBits) == 0) { - unsigned Scale = - cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() / - MaskEltSizeInBits; - ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); - if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && - isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { - return DAG.getBitcast(RootVT, V1); - } - } - // Attempt to match against broadcast-from-vector. // Limit AVX1 to cases where we're loading+broadcasting a scalar element. 
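For reference, a standalone model of the VPERM2X128 immediates built above, assuming the usual VPERM2F128/VPERM2I128 encoding: each nibble selects one 128-bit half from {lo(A), hi(A), lo(B), hi(B)}, and bit 3 of a nibble forces that half to zero. The 0x31 pattern, which a later hunk folds into a plain subvector concatenation, selects the high half of each source:

#include <array>
#include <cassert>
#include <cstdint>

using Lane = std::array<uint64_t, 2>;   // one 128-bit lane as two 64-bit words
using Vec256 = std::array<Lane, 2>;     // lanes 0 (low) and 1 (high)

// Reference model of VPERM2X128: the low nibble of the immediate picks the
// low result half, the high nibble picks the high result half. Bits 1:0 of a
// nibble select among {A.lo, A.hi, B.lo, B.hi}; bit 3 zeroes that half.
Vec256 vperm2x128(const Vec256 &A, const Vec256 &B, unsigned Imm) {
  auto pick = [&](unsigned Sel) -> Lane {
    if (Sel & 0x8)
      return Lane{0, 0};
    std::array<Lane, 4> Src = {A[0], A[1], B[0], B[1]};
    return Src[Sel & 0x3];
  };
  return {pick(Imm & 0xF), pick(Imm >> 4)};
}

int main() {
  Vec256 A = {Lane{1, 2}, Lane{3, 4}};
  Vec256 B = {Lane{5, 6}, Lane{7, 8}};
  // 0x31 selects hi(A) and hi(B): exactly concat(hi(A), hi(B)).
  Vec256 R = vperm2x128(A, B, 0x31);
  assert(R[0] == (Lane{3, 4}) && R[1] == (Lane{7, 8}));
  // A nibble with bit 3 set zeroes the corresponding half.
  assert(vperm2x128(A, B, 0x80)[1] == (Lane{0, 0}));
  return 0;
}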
- if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) - && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) { + if ((Subtarget.hasAVX2() || + (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) && + (!IsMaskedShuffle || NumRootElts == NumMaskElts)) { SmallVector<int, 64> BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { if (V1.getValueType() == MaskVT && @@ -33529,7 +34723,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || + (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); @@ -33540,7 +34735,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || + (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); @@ -33550,12 +34746,31 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } + // Attempt to combine to INSERTPS, but only if the inserted element has come + // from a scalar. + // TODO: Handle other insertions here as well? + if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && + MaskEltSizeInBits == 32 && Subtarget.hasSSE41() && + !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) { + SDValue SrcV1 = V1, SrcV2 = V2; + if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) && + SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) + return SDValue(); // Nothing to do! + Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, + DAG.getBitcast(MVT::v4f32, SrcV1), + DAG.getBitcast(MVT::v4f32, SrcV2), + DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); + return DAG.getBitcast(RootVT, Res); + } + } + SDValue NewV1 = V1; // Save operands in case early exit happens. SDValue NewV2 = V2; if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); @@ -33566,10 +34781,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, NewV1 = V1; // Save operands in case early exit happens. 
NewV2 = V2; - if (matchBinaryPermuteShuffle( - MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, - NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && - (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { + if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, + AllowIntDomain, NewV1, NewV2, DL, DAG, + Subtarget, Shuffle, ShuffleVT, PermuteImm) && + (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 0 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); @@ -33609,6 +34824,44 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, } } + // Match shuffle against TRUNCATE patterns. + if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) { + // Match against a VTRUNC instruction, accounting for src/dst sizes. + if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable, + Subtarget)) { + bool IsTRUNCATE = ShuffleVT.getVectorNumElements() == + ShuffleSrcVT.getVectorNumElements(); + unsigned Opc = + IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC; + if (Depth == 0 && Root.getOpcode() == Opc) + return SDValue(); // Nothing to do! + V1 = DAG.getBitcast(ShuffleSrcVT, V1); + Res = DAG.getNode(Opc, DL, ShuffleVT, V1); + if (ShuffleVT.getSizeInBits() < RootSizeInBits) + Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits); + return DAG.getBitcast(RootVT, Res); + } + + // Do we need a more general binary truncation pattern? + if (RootSizeInBits < 512 && + ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) || + (RootVT.is128BitVector() && Subtarget.hasVLX())) && + (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && + isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { + if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE) + return SDValue(); // Nothing to do! + ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); + ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); + V1 = DAG.getBitcast(ShuffleSrcVT, V1); + V2 = DAG.getBitcast(ShuffleSrcVT, V2); + ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); + ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts); + Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2); + Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res); + return DAG.getBitcast(RootVT, Res); + } + } + // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 1) @@ -33618,8 +34871,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; - bool MaskContainsZeros = - any_of(Mask, [](int M) { return M == SM_SentinelZero; }); + bool MaskContainsZeros = isAnyZero(Mask); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. @@ -33714,7 +34966,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); Res = DAG.getBitcast(MaskVT, V1); unsigned AndOpcode = - FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); + MaskVT.isFloatingPoint() ? 
unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); return DAG.getBitcast(RootVT, Res); } @@ -33791,7 +35043,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, continue; } if (M == SM_SentinelZero) { - PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); + PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; @@ -33822,7 +35074,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, continue; } if (M == SM_SentinelZero) { - VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); + VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; @@ -33897,8 +35149,7 @@ static SDValue combineX86ShuffleChainWithExtract( unsigned &Offset = Offsets[i]; Src = peekThroughBitcasts(Src); EVT BaseVT = Src.getValueType(); - while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR && - isa<ConstantSDNode>(Src.getOperand(1))) { + while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) { Offset += Src.getConstantOperandVal(1); Src = Src.getOperand(0); } @@ -34121,7 +35372,8 @@ static SDValue combineX86ShufflesRecursively( assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); - assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && + unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits(); + assert(VT.getSizeInBits() == RootSizeInBits && "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. @@ -34135,6 +35387,18 @@ static SDValue combineX86ShufflesRecursively( OpZero, DAG, Depth, false)) return SDValue(); + // Shuffle inputs must be the same size as the result, bail on any larger + // inputs and widen any smaller inputs. + if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) { + return Op.getValueSizeInBits() > RootSizeInBits; + })) + return SDValue(); + + for (SDValue &Op : OpInputs) + if (Op.getValueSizeInBits() < RootSizeInBits) + Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG, + SDLoc(Op), RootSizeInBits); + SmallVector<int, 64> Mask; SmallVector<SDValue, 16> Ops; @@ -34535,6 +35799,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, return V; } +// Attempt to commute shufps LHS loads: +// permilps(shufps(load(),x)) --> permilps(shufps(x,load())) +static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, + SelectionDAG &DAG) { + // TODO: Add vXf64 support. + if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32) + return SDValue(); + + // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not. 
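Before the SHUFP commuting helper that follows, a short standalone aside on the 0x80 mask bytes now used above for zero elements: under PSHUFB (and VPPERM's plain zeroing form) any mask byte with its top bit set writes zero, so 0x80 is the minimal canonical encoding, and the old 255/128 constants were equivalent. A hedged reference model:

#include <array>
#include <cassert>
#include <cstdint>

// Reference model of 128-bit PSHUFB: a destination byte is zero when the top
// bit of its mask byte is set, otherwise it is the source byte selected by
// the low four mask bits.
std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &Src,
                               const std::array<uint8_t, 16> &Mask) {
  std::array<uint8_t, 16> Dst{};
  for (int i = 0; i < 16; ++i)
    Dst[i] = (Mask[i] & 0x80) ? 0 : Src[Mask[i] & 0x0F];
  return Dst;
}

int main() {
  std::array<uint8_t, 16> Src{};
  for (int i = 0; i < 16; ++i)
    Src[i] = uint8_t(i + 1);
  std::array<uint8_t, 16> Mask{};
  Mask[0] = 3;     // take source byte 3
  Mask[1] = 0x80;  // top bit set: force zero
  auto Dst = pshufb(Src, Mask);
  assert(Dst[0] == 4 && Dst[1] == 0);
  return 0;
}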
+ auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) { + if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode())) + return SDValue(); + SDValue N0 = V.getOperand(0); + SDValue N1 = V.getOperand(1); + unsigned Imm = V.getConstantOperandVal(2); + if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) || + MayFoldLoad(peekThroughOneUseBitcasts(N1))) + return SDValue(); + Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4); + return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0, + DAG.getTargetConstant(Imm, DL, MVT::i8)); + }; + + switch (N.getOpcode()) { + case X86ISD::VPERMILPI: + if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) { + unsigned Imm = N.getConstantOperandVal(1); + return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP, + DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8)); + } + break; + case X86ISD::SHUFP: { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + unsigned Imm = N.getConstantOperandVal(2); + if (N0 == N1) { + if (SDValue NewSHUFP = commuteSHUFP(N, N0)) + return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP, + DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8)); + } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) { + return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1, + DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8)); + } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) { + return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP, + DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8)); + } + break; + } + } + + return SDValue(); +} + /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -34544,35 +35861,105 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SmallVector<int, 4> Mask; unsigned Opcode = N.getOpcode(); + bool IsUnary; + SmallVector<int, 64> TargetMask; + SmallVector<SDValue, 2> TargetOps; + if (isTargetShuffle(Opcode)) + getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary); + // Combine binary shuffle of 2 similar 'Horizontal' instructions into a - // single instruction. - if (VT.getScalarSizeInBits() == 64 && - (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH || - Opcode == X86ISD::UNPCKL)) { - auto BC0 = peekThroughBitcasts(N.getOperand(0)); - auto BC1 = peekThroughBitcasts(N.getOperand(1)); - EVT VT0 = BC0.getValueType(); - EVT VT1 = BC1.getValueType(); - unsigned Opcode0 = BC0.getOpcode(); - unsigned Opcode1 = BC1.getOpcode(); - if (Opcode0 == Opcode1 && VT0 == VT1 && - (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || - Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB || - Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { - SDValue Lo, Hi; - if (Opcode == X86ISD::MOVSD) { - Lo = BC1.getOperand(0); - Hi = BC0.getOperand(1); - } else { - Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); - Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); + // single instruction. Attempt to match a v2X64 repeating shuffle pattern that + // represents the LHS/RHS inputs for the lower/upper halves. 
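The VPERMILPI case of combineCommutableSHUFP above relies on a pure immediate identity: commuting the SHUFPS sources swaps the two nibbles of its immediate and swaps which half of the intermediate result each element lives in, and the following VPERMILPS undoes that by XOR-ing its immediate with 0xAA (flipping bit 1 of every two-bit index). A standalone exhaustive check of that algebra, assuming the usual SHUFPS/VPERMILPS encodings and ignoring the load-foldability condition that actually gates the transform:

#include <array>
#include <cassert>

using V4 = std::array<float, 4>;

// Reference models: SHUFPS takes its low two result elements from the first
// source and the high two from the second; VPERMILPS indexes a single source,
// two immediate bits per element.
V4 shufps(const V4 &A, const V4 &B, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}
V4 permilps(const V4 &A, unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], A[(Imm >> 4) & 3], A[(Imm >> 6) & 3]};
}

int main() {
  V4 X = {1, 2, 3, 4}, Y = {5, 6, 7, 8};
  for (unsigned SI = 0; SI < 256; ++SI)
    for (unsigned PI = 0; PI < 256; ++PI) {
      unsigned Swapped = ((SI & 0x0F) << 4) | ((SI & 0xF0) >> 4);
      // permilps(shufps(X,Y,SI), PI) == permilps(shufps(Y,X,Swapped), PI^0xAA)
      assert(permilps(shufps(X, Y, SI), PI) ==
             permilps(shufps(Y, X, Swapped), PI ^ 0xAA));
    }
  return 0;
}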
+ SmallVector<int, 16> TargetMask128; + if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 && + isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) { + SmallVector<int, 16> WidenedMask128 = TargetMask128; + while (WidenedMask128.size() > 2) { + SmallVector<int, 16> WidenedMask; + if (!canWidenShuffleElements(WidenedMask128, WidenedMask)) + break; + WidenedMask128 = std::move(WidenedMask); + } + if (WidenedMask128.size() == 2) { + assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle"); + SDValue BC0 = peekThroughBitcasts(TargetOps.front()); + SDValue BC1 = peekThroughBitcasts(TargetOps.back()); + EVT VT0 = BC0.getValueType(); + EVT VT1 = BC1.getValueType(); + unsigned Opcode0 = BC0.getOpcode(); + unsigned Opcode1 = BC1.getOpcode(); + bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || + Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB); + if (Opcode0 == Opcode1 && VT0 == VT1 && + (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { + bool SingleOp = (TargetOps.size() == 1); + if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) { + SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1; + SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1; + Lo = Lo.getOperand(WidenedMask128[0] & 1); + Hi = Hi.getOperand(WidenedMask128[1] & 1); + if (SingleOp) { + MVT SrcVT = BC0.getOperand(0).getSimpleValueType(); + SDValue Undef = DAG.getUNDEF(SrcVT); + SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL); + Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo); + Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi); + Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo); + Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi); + } + SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); + return DAG.getBitcast(VT, Horiz); + } } - SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); - return DAG.getBitcast(VT, Horiz); } } + if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) + return R; + + // Canonicalize UNARYSHUFFLE(XOR(X,-1) -> XOR(UNARYSHUFFLE(X),-1) to + // help expose the 'NOT' pattern further up the DAG. + // TODO: This might be beneficial for any binop with a 'splattable' operand. + switch (Opcode) { + case X86ISD::MOVDDUP: + case X86ISD::PSHUFD: { + SDValue Src = N.getOperand(0); + if (Src.hasOneUse() && Src.getValueType() == VT) { + if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) { + Not = DAG.getBitcast(VT, Not); + Not = Opcode == X86ISD::MOVDDUP + ? DAG.getNode(Opcode, DL, VT, Not) + : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1)); + EVT IntVT = Not.getValueType().changeTypeToInteger(); + SDValue AllOnes = DAG.getConstant(-1, DL, IntVT); + Not = DAG.getBitcast(IntVT, Not); + Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes); + return DAG.getBitcast(VT, Not); + } + } + break; + } + } + + // Handle specific target shuffles. switch (Opcode) { + case X86ISD::MOVDDUP: { + SDValue Src = N.getOperand(0); + // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload. + if (VT == MVT::v2f64 && Src.hasOneUse() && + ISD::isNormalLoad(Src.getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(Src); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) { + SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad); + DCI.CombineTo(N.getNode(), Movddup); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return N; // Return N so it doesn't get rechecked! 
+ } + } + + return SDValue(); + } case X86ISD::VBROADCAST: { SDValue Src = N.getOperand(0); SDValue BC = peekThroughBitcasts(Src); @@ -34598,7 +35985,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, // broadcast(bitcast(src)) -> bitcast(broadcast(src)) // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward. if (Src.getOpcode() == ISD::BITCAST && - SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) { + SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() && + DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) { EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(), VT.getVectorNumElements()); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); @@ -34645,6 +36033,190 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, return N; // Return N so it doesn't get rechecked! } + // Due to isTypeDesirableForOp, we won't always shrink a load truncated to + // i16. So shrink it ourselves if we can make a broadcast_load. + if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE && + Src.hasOneUse() && Src.getOperand(0).hasOneUse()) { + assert(Subtarget.hasAVX2() && "Expected AVX2"); + SDValue TruncIn = Src.getOperand(0); + + // If this is a truncate of a non extending load we can just narrow it to + // use a broadcast_load. + if (ISD::isNormalLoad(TruncIn.getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(TruncIn); + // Unless its volatile or atomic. + if (LN->isSimple()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, + LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); + DCI.CombineTo(N.getNode(), BcastLd); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(Src.getNode()); + return N; // Return N so it doesn't get rechecked! + } + } + + // If this is a truncate of an i16 extload, we can directly replace it. + if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) && + ISD::isEXTLoad(Src.getOperand(0).getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0)); + if (LN->getMemoryVT().getSizeInBits() == 16) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DCI.CombineTo(N.getNode(), BcastLd); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(Src.getNode()); + return N; // Return N so it doesn't get rechecked! + } + } + + // If this is a truncate of load that has been shifted right, we can + // offset the pointer and use a narrower load. + if (TruncIn.getOpcode() == ISD::SRL && + TruncIn.getOperand(0).hasOneUse() && + isa<ConstantSDNode>(TruncIn.getOperand(1)) && + ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0)); + unsigned ShiftAmt = TruncIn.getConstantOperandVal(1); + // Make sure the shift amount and the load size are divisible by 16. + // Don't do this if the load is volatile or atomic. 
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 && + LN->isSimple()) { + unsigned Offset = ShiftAmt / 8; + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL); + SDValue Ops[] = { LN->getChain(), Ptr }; + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, + LN->getPointerInfo().getWithOffset(Offset), + LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); + DCI.CombineTo(N.getNode(), BcastLd); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(Src.getNode()); + return N; // Return N so it doesn't get rechecked! + } + } + } + + // vbroadcast(vzload X) -> vbroadcast_load X + if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) { + MemSDNode *LN = cast<MemIntrinsicSDNode>(Src); + if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DCI.CombineTo(N.getNode(), BcastLd); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return N; // Return N so it doesn't get rechecked! + } + } + + // vbroadcast(vector load X) -> vbroadcast_load + if (SrcVT == MVT::v2f64 && Src.hasOneUse() && + ISD::isNormalLoad(Src.getNode())) { + LoadSDNode *LN = cast<LoadSDNode>(Src); + // Unless the load is volatile or atomic. + if (LN->isSimple()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, + LN->getPointerInfo(), LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); + DCI.CombineTo(N.getNode(), BcastLd); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return N; // Return N so it doesn't get rechecked! + } + } + + return SDValue(); + } + case X86ISD::VZEXT_MOVL: { + SDValue N0 = N.getOperand(0); + + // If this a vzmovl of a full vector load, replace it with a vzload, unless + // the load is volatile. + if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) { + auto *LN = cast<LoadSDNode>(N0); + if (SDValue VZLoad = + narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) { + DCI.CombineTo(N.getNode(), VZLoad); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return N; + } + } + + // If this a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast + // and can just use a VZEXT_LOAD. + // FIXME: Is there some way to do this with SimplifyDemandedVectorElts? 
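The pointer-offset rewrite above leans on x86 being little-endian: when ShiftAmt is a multiple of 16, truncating (load >> ShiftAmt) to i16 reads exactly the two bytes at offset ShiftAmt / 8, so the wide load can be replaced by a narrow one at an adjusted address. A small demonstration (assumes a little-endian host, as on x86; narrowLoad16 is an illustrative helper):

#include <cassert>
#include <cstdint>
#include <cstring>

// Reads an unaligned 16-bit value at the given byte offset.
uint16_t narrowLoad16(const unsigned char *Base, unsigned ByteOffset) {
  uint16_t V;
  std::memcpy(&V, Base + ByteOffset, sizeof(V));
  return V;
}

int main() {
  uint64_t Wide = 0x1122334455667788ull;
  unsigned char Bytes[8];
  std::memcpy(Bytes, &Wide, sizeof(Wide));   // little-endian layout assumed
  for (unsigned ShiftAmt = 0; ShiftAmt < 64; ShiftAmt += 16)
    assert(narrowLoad16(Bytes, ShiftAmt / 8) == (uint16_t)(Wide >> ShiftAmt));
  return 0;
}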
+ if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *LN = cast<MemSDNode>(N0); + if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; + SDValue VZLoad = + DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DCI.CombineTo(N.getNode(), VZLoad); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return N; + } + } + + // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into + // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X))))))) + // if the upper bits of the i64 are zero. + if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR && + N0.getOperand(0).hasOneUse() && + N0.getOperand(0).getValueType() == MVT::i64) { + SDValue In = N0.getOperand(0); + APInt Mask = APInt::getHighBitsSet(64, 32); + if (DAG.MaskedValueIsZero(In, Mask)) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In); + MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); + SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec); + return DAG.getBitcast(VT, Movl); + } + } + + // Load a scalar integer constant directly to XMM instead of transferring an + // immediate value from GPR. + // vzext_movl (scalar_to_vector C) --> load [C,0...] + if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) { + // Create a vector constant - scalar constant followed by zeros. + EVT ScalarVT = N0.getOperand(0).getValueType(); + Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext()); + unsigned NumElts = VT.getVectorNumElements(); + Constant *Zero = ConstantInt::getNullValue(ScalarTy); + SmallVector<Constant *, 32> ConstantVec(NumElts, Zero); + ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue()); + + // Load the vector constant from constant pool. + MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT); + MachinePointerInfo MPI = + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); + Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); + return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment, + MachineMemOperand::MOLoad); + } + } + return SDValue(); } case X86ISD::BLENDI: { @@ -34685,6 +36257,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } return SDValue(); } + case X86ISD::VPERM2X128: { + // If both 128-bit values were inserted into high halves of 256-bit values, + // the shuffle can be reduced to a concatenation of subvectors: + // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y + // Note: We are only looking for the exact high/high shuffle mask because we + // expect to fold other similar patterns before creating this opcode. 
+ SDValue Ins0 = peekThroughBitcasts(N.getOperand(0)); + SDValue Ins1 = peekThroughBitcasts(N.getOperand(1)); + unsigned Imm = N.getConstantOperandVal(2); + if (!(Imm == 0x31 && + Ins0.getOpcode() == ISD::INSERT_SUBVECTOR && + Ins1.getOpcode() == ISD::INSERT_SUBVECTOR && + Ins0.getValueType() == Ins1.getValueType())) + return SDValue(); + + SDValue X = Ins0.getOperand(1); + SDValue Y = Ins1.getOperand(1); + unsigned C1 = Ins0.getConstantOperandVal(2); + unsigned C2 = Ins1.getConstantOperandVal(2); + MVT SrcVT = X.getSimpleValueType(); + unsigned SrcElts = SrcVT.getVectorNumElements(); + if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 || + C1 != SrcElts || C2 != SrcElts) + return SDValue(); + + return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, + Ins1.getValueType(), X, Y)); + } case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: @@ -34724,8 +36324,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); - SDValue Op2 = N.getOperand(2); - unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned InsertPSMask = N.getConstantOperandVal(2); unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; @@ -34865,9 +36464,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && - V.hasOneUse()) { + V.hasOneUse() && V.getOperand(0).hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); - if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { + if (D.getOpcode() == X86ISD::PSHUFD) { SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; @@ -35266,7 +36865,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, } // Attempt to combine into a vector load/broadcast. - if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true)) + if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG, + Subtarget, true)) return LD; // For AVX2, we sometimes want to combine @@ -35299,79 +36899,100 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, return SDValue(N, 0); } - // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros - // in the upper 64 bits. - // TODO: Can we generalize this using computeKnownBits. 
- if (N->getOpcode() == X86ISD::VZEXT_MOVL && - (VT == MVT::v2f64 || VT == MVT::v2i64) && - N->getOperand(0).getOpcode() == ISD::BITCAST && - (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 || - N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) { - SDValue In = N->getOperand(0).getOperand(0); - switch (In.getOpcode()) { - default: - break; - case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: - case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: - case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: - case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: - case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: - case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: - case X86ISD::VFPROUND: case X86ISD::VMFPROUND: - if (In.getOperand(0).getValueType() == MVT::v2f64 || - In.getOperand(0).getValueType() == MVT::v2i64) - return N->getOperand(0); // return the bitcast - break; - case X86ISD::STRICT_CVTTP2SI: - case X86ISD::STRICT_CVTTP2UI: - case X86ISD::STRICT_CVTSI2P: - case X86ISD::STRICT_CVTUI2P: - case X86ISD::STRICT_VFPROUND: - if (In.getOperand(1).getValueType() == MVT::v2f64 || - In.getOperand(1).getValueType() == MVT::v2i64) - return N->getOperand(0); - break; - } - } - // Pull subvector inserts into undef through VZEXT_MOVL by making it an // insert into a zero vector. This helps get VZEXT_MOVL closer to // scalar_to_vectors where 256/512 are canonicalized to an insert and a // 128-bit scalar_to_vector. This reduces the number of isel patterns. if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() && - N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR && - N->getOperand(0).hasOneUse() && - N->getOperand(0).getOperand(0).isUndef() && - isNullConstant(N->getOperand(0).getOperand(2))) { - SDValue In = N->getOperand(0).getOperand(1); - SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, - getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), - Movl, N->getOperand(0).getOperand(2)); - } - - // If this a vzmovl of a full vector load, replace it with a vzload, unless - // the load is volatile. - if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() && - ISD::isNormalLoad(N->getOperand(0).getNode())) { - LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - if (LN->isSimple()) { - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, - VT.getVectorElementType(), - LN->getPointerInfo(), - LN->getAlignment(), - MachineMemOperand::MOLoad); - DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); - return VZLoad; + N->getOperand(0).hasOneUse()) { + SDValue V = peekThroughOneUseBitcasts(N->getOperand(0)); + + if (V.getOpcode() == ISD::INSERT_SUBVECTOR && + V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) { + SDValue In = V.getOperand(1); + MVT SubVT = + MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), + In.getValueSizeInBits() / VT.getScalarSizeInBits()); + In = DAG.getBitcast(SubVT, In); + SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, + getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl), + Movl, V.getOperand(2)); } } return SDValue(); } +// Simplify variable target shuffle masks based on the demanded elements. +// TODO: Handle DemandedBits in mask indices as well? 
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle( + SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, + TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const { + // If we're demanding all elements don't bother trying to simplify the mask. + unsigned NumElts = DemandedElts.getBitWidth(); + if (DemandedElts.isAllOnesValue()) + return false; + + SDValue Mask = Op.getOperand(MaskIndex); + if (!Mask.hasOneUse()) + return false; + + // Attempt to generically simplify the variable shuffle mask. + APInt MaskUndef, MaskZero; + if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, + Depth + 1)) + return true; + + // Attempt to extract+simplify a (constant pool load) shuffle mask. + // TODO: Support other types from getTargetShuffleMaskIndices? + SDValue BC = peekThroughOneUseBitcasts(Mask); + EVT BCVT = BC.getValueType(); + auto *Load = dyn_cast<LoadSDNode>(BC); + if (!Load) + return false; + + const Constant *C = getTargetConstantFromNode(Load); + if (!C) + return false; + + Type *CTy = C->getType(); + if (!CTy->isVectorTy() || + CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits()) + return false; + + // Handle scaling for i64 elements on 32-bit targets. + unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements(); + if (NumCstElts != NumElts && NumCstElts != (NumElts * 2)) + return false; + unsigned Scale = NumCstElts / NumElts; + + // Simplify mask if we have an undemanded element that is not undef. + bool Simplified = false; + SmallVector<Constant *, 32> ConstVecOps; + for (unsigned i = 0; i != NumCstElts; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) { + ConstVecOps.push_back(UndefValue::get(Elt->getType())); + Simplified = true; + continue; + } + ConstVecOps.push_back(Elt); + } + if (!Simplified) + return false; + + // Generate new constant pool entry + legalize immediately for the load. + SDLoc DL(Op); + SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT); + SDValue LegalCV = LowerConstantPool(CV, TLO.DAG); + SDValue NewMask = TLO.DAG.getLoad( + BCVT, DL, TLO.DAG.getEntryNode(), LegalCV, + MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()), + Load->getAlign()); + return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask)); +} + bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { @@ -35541,12 +37162,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // Aggressively peek through ops to get at the demanded elts. // TODO - we should do this for all target/faux shuffles ops. if (!DemandedElts.isAllOnesValue()) { - APInt DemandedSrcBits = - APInt::getAllOnesValue(N0.getScalarValueSizeInBits()); - SDValue NewN0 = SimplifyMultipleUseDemandedBits( - N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1); - SDValue NewN1 = SimplifyMultipleUseDemandedBits( - N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1); + SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS, + TLO.DAG, Depth + 1); + SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS, + TLO.DAG, Depth + 1); if (NewN0 || NewN1) { NewN0 = NewN0 ? NewN0 : N0; NewN1 = NewN1 ? 
NewN1 : N1; @@ -35608,6 +37227,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( KnownUndef = LHSUndef & RHSUndef; break; } + case X86ISD::VZEXT_MOVL: { + // If upper demanded elements are already zero then we have nothing to do. + SDValue Src = Op.getOperand(0); + APInt DemandedUpperElts = DemandedElts; + DemandedUpperElts.clearLowBits(1); + if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero()) + return TLO.CombineTo(Op, Src); + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -35625,36 +37253,32 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; + // Aggressively peek through src to get at the demanded elt. + // TODO - we should do this for all target/faux shuffles ops. + if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( + Src, SrcElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); break; } - case X86ISD::VPERMV: { - SDValue Mask = Op.getOperand(0); - APInt MaskUndef, MaskZero; - if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, - Depth + 1)) + case X86ISD::VPERMV: + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO, + Depth)) return true; break; - } case X86ISD::PSHUFB: case X86ISD::VPERMV3: - case X86ISD::VPERMILPV: { - SDValue Mask = Op.getOperand(1); - APInt MaskUndef, MaskZero; - if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, - Depth + 1)) + case X86ISD::VPERMILPV: + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO, + Depth)) return true; break; - } case X86ISD::VPPERM: - case X86ISD::VPERMIL2: { - SDValue Mask = Op.getOperand(2); - APInt MaskUndef, MaskZero; - if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, - Depth + 1)) + case X86ISD::VPERMIL2: + if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO, + Depth)) return true; break; } - } // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not // demand any of the high elements, then narrow the op to 128/256-bits: e.g. @@ -35669,18 +37293,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( ExtSizeInBits = SizeInBits / 4; switch (Opc) { - // Zero upper elements. - case X86ISD::VZEXT_MOVL: { - SDLoc DL(Op); - SDValue Ext0 = - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); - SDValue ExtOp = - TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0); - SDValue UndefVec = TLO.DAG.getUNDEF(VT); - SDValue Insert = - insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); - return TLO.CombineTo(Op, Insert); - } // Subvector broadcast. case X86ISD::SUBV_BROADCAST: { SDLoc DL(Op); @@ -35733,10 +37345,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } break; } - // Target Shuffles. + // Zero upper elements. + case X86ISD::VZEXT_MOVL: + // Target unary shuffles by immediate: + case X86ISD::PSHUFD: + case X86ISD::PSHUFLW: + case X86ISD::PSHUFHW: + case X86ISD::VPERMILPI: + // (Non-Lane Crossing) Target Shuffles. + case X86ISD::VPERMILPV: + case X86ISD::VPERMIL2: case X86ISD::PSHUFB: case X86ISD::UNPCKL: case X86ISD::UNPCKH: + case X86ISD::BLENDI: // Saturated Packs. 
case X86ISD::PACKSS: case X86ISD::PACKUS: @@ -35746,14 +37368,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::FHADD: case X86ISD::FHSUB: { SDLoc DL(Op); + SmallVector<SDValue, 4> Ops; + for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { + SDValue SrcOp = Op.getOperand(i); + EVT SrcVT = SrcOp.getValueType(); + assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) && + "Unsupported vector size"); + Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL, + ExtSizeInBits) + : SrcOp); + } MVT ExtVT = VT.getSimpleVT(); ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), ExtSizeInBits / ExtVT.getScalarSizeInBits()); - SDValue Ext0 = - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); - SDValue Ext1 = - extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits); - SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1); + SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops); SDValue UndefVec = TLO.DAG.getUNDEF(VT); SDValue Insert = insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); @@ -35850,6 +37478,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { + case X86ISD::VTRUNC: { + KnownBits KnownOp; + SDValue Src = Op.getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + // Simplify the input, using demanded bit information. + APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits()); + APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); + if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1)) + return true; + break; + } case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. @@ -35906,6 +37546,14 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( } } + // If we are only demanding sign bits then we can use the shift source directly. + unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1); + unsigned UpperDemandedBits = + BitWidth - OriginalDemandedBits.countTrailingZeros(); + if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) + return TLO.CombineTo(Op, Op0); + if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; @@ -36019,7 +37667,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( return TLO.CombineTo( Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); - Known = KnownVec.zext(BitWidth, true); + Known = KnownVec.zext(BitWidth); return false; } break; @@ -36072,6 +37720,17 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, KnownRHS, TLO, Depth + 1)) return true; + + // Attempt to avoid multi-use ops if we don't need anything from them. + SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( + Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1); + SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( + Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1); + if (DemandedOp0 || DemandedOp1) { + SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0); + SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1); + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1)); + } } // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. 
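The VSHLI handling above (and its SimplifyMultipleUseDemandedBits twin in a later hunk) uses the fact that if a value has S redundant sign bits and only its top S - ShAmt bits are demanded, shifting it left by ShAmt changes none of the demanded bits: both the original and the shifted value are just copies of the sign bit there. A standalone check of that identity (numSignBits is an illustrative stand-in for ComputeNumSignBits):

#include <cassert>
#include <cstdint>

// Number of redundant sign bits of a 32-bit value (always >= 1).
unsigned numSignBits(int32_t V) {
  unsigned N = 1;
  while (N < 32 && ((V >> (31 - N)) & 1) == ((V >> 31) & 1))
    ++N;
  return N;
}

int main() {
  for (int32_t V : {int32_t(0x00000123), int32_t(0xFFFFFE00), int32_t(-1),
                    int32_t(7)}) {
    unsigned S = numSignBits(V);
    for (unsigned ShAmt = 0; ShAmt < S; ++ShAmt) {
      unsigned DemandedTop = S - ShAmt;          // width of demanded high bits
      uint32_t Mask = ~0u << (32 - DemandedTop); // demanded-bit mask
      // The shifted value matches the source on every demanded bit.
      assert(((uint32_t(V) << ShAmt) & Mask) == (uint32_t(V) & Mask));
    }
  }
  return 0;
}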
break; @@ -36104,16 +37763,51 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( // MOVMSK only uses the MSB from each vector element. KnownBits KnownSrc; - if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts, - KnownSrc, TLO, Depth + 1)) + APInt DemandedSrcBits = APInt::getSignMask(SrcBits); + if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO, + Depth + 1)) return true; if (KnownSrc.One[SrcBits - 1]) Known.One.setLowBits(NumElts); else if (KnownSrc.Zero[SrcBits - 1]) Known.Zero.setLowBits(NumElts); + + // Attempt to avoid multi-use os if we don't need anything from it. + if (SDValue NewSrc = SimplifyMultipleUseDemandedBits( + Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1)) + return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); return false; } + case X86ISD::BEXTR: { + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + + // Only bottom 16-bits of the control bits are required. + if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) { + // NOTE: SimplifyDemandedBits won't do this for constants. + const APInt &Val1 = Cst1->getAPIntValue(); + APInt MaskedVal1 = Val1 & 0xFFFF; + if (MaskedVal1 != Val1) { + SDLoc DL(Op); + return TLO.CombineTo( + Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0, + TLO.DAG.getConstant(MaskedVal1, DL, VT))); + } + } + + KnownBits Known1; + APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16)); + if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1)) + return true; + + // If the length is 0, replace with 0. + KnownBits LengthBits = Known1.extractBits(8, 8); + if (LengthBits.isZero()) + return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); + + break; + } } return TargetLowering::SimplifyDemandedBitsForTargetNode( @@ -36137,8 +37831,26 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && !DemandedElts[CIdx->getZExtValue()]) return Vec; - break; + break; + } + case X86ISD::VSHLI: { + // If we are only demanding sign bits then we can use the shift source + // directly. + SDValue Op0 = Op.getOperand(0); + unsigned ShAmt = Op.getConstantOperandVal(1); + unsigned BitWidth = DemandedBits.getBitWidth(); + unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); + unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros(); + if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) + return Op0; + break; } + case X86ISD::VSRAI: + // iff we only need the sign bit then we can use the source directly. + // TODO: generalize where we only demand extended signbits. + if (DemandedBits.isSignMask()) + return Op.getOperand(0); + break; case X86ISD::PCMPGT: // icmp sgt(0, R) == ashr(R, BitWidth-1). // iff we only need the sign bit then we can use R directly. 
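The BEXTR handling added above masks the control operand to its low 16 bits and folds a zero length field to a constant 0; both follow from the instruction's documented semantics. A minimal scalar sketch of those semantics (the helper name is illustrative, not from the patch):

    #include <cstdint>

    // Scalar model of 64-bit BEXTR: bits 7:0 of the control are the start
    // position, bits 15:8 the length; everything above bit 15 is ignored.
    uint64_t bextr64(uint64_t Src, uint64_t Ctrl) {
      unsigned Start = Ctrl & 0xFF;
      unsigned Len = (Ctrl >> 8) & 0xFF;
      if (Len == 0 || Start >= 64)
        return 0;                          // zero length (or start past the MSB) -> 0
      uint64_t Bits = Src >> Start;
      if (Len < 64)
        Bits &= (uint64_t(1) << Len) - 1;  // keep only Len extracted bits
      return Bits;
    }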
@@ -36172,13 +37884,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( int M = ShuffleMask[i]; if (!DemandedElts[i] || ShuffleUndef[i]) continue; - int Op = M / NumElts; - int Index = M % NumElts; - if (M < 0 || Index != i) { + int OpIdx = M / NumElts; + int EltIdx = M % NumElts; + if (M < 0 || EltIdx != i) { IdentityOp.clearAllBits(); break; } - IdentityOp &= APInt::getOneBitSet(NumOps, Op); + IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx); if (IdentityOp == 0) break; } @@ -36209,6 +37921,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents. +static unsigned getAltBitOpcode(unsigned Opcode) { + switch(Opcode) { + case ISD::AND: return X86ISD::FAND; + case ISD::OR: return X86ISD::FOR; + case ISD::XOR: return X86ISD::FXOR; + case X86ISD::ANDNP: return X86ISD::FANDN; + } + llvm_unreachable("Unknown bitwise opcode"); +} + +// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets. +static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, + const SDLoc &DL) { + EVT SrcVT = Src.getValueType(); + if (SrcVT != MVT::v4i1) + return SDValue(); + + switch (Src.getOpcode()) { + case ISD::SETCC: + if (Src.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && + cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) { + SDValue Op0 = Src.getOperand(0); + if (ISD::isNormalLoad(Op0.getNode())) + return DAG.getBitcast(MVT::v4f32, Op0); + if (Op0.getOpcode() == ISD::BITCAST && + Op0.getOperand(0).getValueType() == MVT::v4f32) + return Op0.getOperand(0); + } + break; + case ISD::AND: + case ISD::XOR: + case ISD::OR: { + SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL); + SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL); + if (Op0 && Op1) + return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0, + Op1); + break; + } + } + return SDValue(); +} + // Helper to push sign extension of vXi1 SETCC result through bitops. static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL) { @@ -36239,18 +37996,40 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); + // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type + // legalization destroys the v4i32 type. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) { + if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) { + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, + DAG.getBitcast(MVT::v4f32, V)); + return DAG.getZExtOrTrunc(V, DL, VT); + } + } + // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. 
- bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && - (Src.getOperand(0).getValueType() == MVT::v16i8 || - Src.getOperand(0).getValueType() == MVT::v32i8 || - Src.getOperand(0).getValueType() == MVT::v64i8); + bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && + (Src.getOperand(0).getValueType() == MVT::v16i8 || + Src.getOperand(0).getValueType() == MVT::v32i8 || + Src.getOperand(0).getValueType() == MVT::v64i8); + + // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled + // directly with vpmovmskb/vmovmskps/vmovmskpd. + if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && + cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT && + ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) { + EVT CmpVT = Src.getOperand(0).getValueType(); + EVT EltVT = CmpVT.getVectorElementType(); + if (CmpVT.getSizeInBits() <= 256 && + (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64)) + PreferMovMsk = true; + } // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. - if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated)) + if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk)) return SDValue(); // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and @@ -36306,7 +38085,14 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, case MVT::v64i1: // If we have AVX512F, but not AVX512BW and the input is truncated from // v64i8 checked earlier. Then split the input and make two pmovmskbs. - if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) { + if (Subtarget.hasAVX512()) { + if (Subtarget.hasBWI()) + return SDValue(); + SExtVT = MVT::v64i8; + break; + } + // Split if this is a <64 x i8> comparison result. + if (checkBitcastSrcVectorSize(Src, 512)) { SExtVT = MVT::v64i8; break; } @@ -36476,6 +38262,74 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, return Ops[0]; } +// Recursive function that attempts to find if a bool vector node was originally +// a vector/float/double that got truncated/extended/bitcast to/from a scalar +// integer. If so, replace the scalar ops with bool vector equivalents back down +// the chain. +static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned Opc = V.getOpcode(); + switch (Opc) { + case ISD::BITCAST: { + // Bitcast from a vector/float/double, we can cheaply bitcast to VT. + SDValue Src = V.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.isVector() || SrcVT.isFloatingPoint()) + return DAG.getBitcast(VT, Src); + break; + } + case ISD::TRUNCATE: { + // If we find a suitable source, a truncated scalar becomes a subvector. + SDValue Src = V.getOperand(0); + EVT NewSrcVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits()); + if (TLI.isTypeLegal(NewSrcVT)) + if (SDValue N0 = + combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget)) + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0, + DAG.getIntPtrConstant(0, DL)); + break; + } + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: { + // If we find a suitable source, an extended scalar becomes a subvector. 
+ SDValue Src = V.getOperand(0); + EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + Src.getScalarValueSizeInBits()); + if (TLI.isTypeLegal(NewSrcVT)) + if (SDValue N0 = + combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget)) + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT) + : DAG.getConstant(0, DL, VT), + N0, DAG.getIntPtrConstant(0, DL)); + break; + } + case ISD::OR: { + // If we find suitable sources, we can just move an OR to the vector domain. + SDValue Src0 = V.getOperand(0); + SDValue Src1 = V.getOperand(1); + if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) + if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget)) + return DAG.getNode(Opc, DL, VT, N0, N1); + break; + } + case ISD::SHL: { + // If we find a suitable source, a SHL becomes a KSHIFTL. + SDValue Src0 = V.getOperand(0); + if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1))) + if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) + return DAG.getNode( + X86ISD::KSHIFTL, DL, VT, N0, + DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8)); + break; + } + } + return SDValue(); +} + static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -36494,24 +38348,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; - // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type - // legalization destroys the v4i32 type. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && - VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && - N0.getOperand(0).getValueType() == MVT::v4i32 && - ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && - cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) { - SDValue N00 = N0.getOperand(0); - // Only do this if we can avoid scalarizing the input. - if (ISD::isNormalLoad(N00.getNode()) || - (N00.getOpcode() == ISD::BITCAST && - N00.getOperand(0).getValueType() == MVT::v4f32)) { - SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, - DAG.getBitcast(MVT::v4f32, N00)); - return DAG.getZExtOrTrunc(V, dl, VT); - } - } - // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -36553,6 +38389,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } + } else { + // If we're bitcasting from iX to vXi1, see if the integer originally + // began as a vXi1 and whether we can remove the bitcast entirely. + if (VT.isVector() && VT.getScalarType() == MVT::i1 && + SrcVT.isScalarInteger() && + DAG.getTargetLoweringInfo().isTypeLegal(VT)) { + if (SDValue V = + combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget)) + return V; + } } // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and @@ -36567,19 +38413,30 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, DAG.getBitcast(MVT::i16, N0.getOperand(0))); - // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT - // determines // the number of bits loaded. Remaining bits are zero. 
+ // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast + // and the vbroadcast_load are both integer or both fp. In some cases this + // will remove the bitcast entirely. if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && - VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) { + VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) { auto *BCast = cast<MemIntrinsicSDNode>(N0); - SDVTList Tys = DAG.getVTList(VT, MVT::Other); - SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, - VT.getVectorElementType(), - BCast->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); - return ResNode; + unsigned SrcVTSize = SrcVT.getScalarSizeInBits(); + unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits(); + // Don't swap i8/i16 since don't have fp types that size. + if (MemSize >= 32) { + MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize) + : MVT::getIntegerVT(MemSize); + MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize) + : MVT::getIntegerVT(SrcVTSize); + LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements()); + + SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); + SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + MemVT, BCast->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); + return DAG.getBitcast(VT, ResNode); + } } // Since MMX types are special and don't usually play with other vector types, @@ -36666,6 +38523,47 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return DAG.getConstant(0, SDLoc(N0), VT); } + // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1. + // Turn it into a sign bit compare that produces a k-register. This avoids + // a trip through a GPR. + if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && + VT.isVector() && VT.getVectorElementType() == MVT::i1 && + isPowerOf2_32(VT.getVectorNumElements())) { + unsigned NumElts = VT.getVectorNumElements(); + SDValue Src = N0; + + // Peek through truncate. + if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) + Src = N0.getOperand(0); + + if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) { + SDValue MovmskIn = Src.getOperand(0); + MVT MovmskVT = MovmskIn.getSimpleValueType(); + unsigned MovMskElts = MovmskVT.getVectorNumElements(); + + // We allow extra bits of the movmsk to be used since they are known zero. + // We can't convert a VPMOVMSKB without avx512bw. + if (MovMskElts <= NumElts && + (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) { + EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger(); + MovmskIn = DAG.getBitcast(IntVT, MovmskIn); + SDLoc dl(N); + MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts); + SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn, + DAG.getConstant(0, dl, IntVT), ISD::SETLT); + if (EVT(CmpVT) == VT) + return Cmp; + + // Pad with zeroes up to original VT to replace the zeroes that were + // being used from the MOVMSK. + unsigned NumConcats = NumElts / MovMskElts; + SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT)); + Ops[0] = Cmp; + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops); + } + } + } + // Try to remove bitcasts from input and output of mask arithmetic to // remove GPR<->K-register crossings. 
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget)) @@ -36790,12 +38688,9 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { - unsigned NumElts = SrcVT.getVectorNumElements(); - unsigned NumSubElts = NumElts / 2; - SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); - unsigned SubSizeInBits = SrcVT.getSizeInBits(); - SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); - SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL); + SrcVT = Lo.getValueType(); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || @@ -36882,6 +38777,25 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); Movmsk = DAG.getBitcast(MovmskVT, Match); } else { + // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have + // PCMPEQQ (SSE41+), use PCMPEQD instead. + if (BinOp == ISD::AND && !Subtarget.hasSSE41() && + Match.getOpcode() == ISD::SETCC && + ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) && + cast<CondCodeSDNode>(Match.getOperand(2))->get() == + ISD::CondCode::SETEQ) { + SDValue Vec = Match.getOperand(0); + if (Vec.getValueType().getScalarType() == MVT::i64 && + (2 * NumElts) <= MaxElts) { + NumElts *= 2; + EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + Match = DAG.getSetCC( + DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)), + DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ); + } + } + // Use combineBitcastvxi1 to create the MOVMSK. while (NumElts > MaxElts) { SDValue Lo, Hi; @@ -36896,10 +38810,7 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, return SDValue(); Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); } else { - // Bail with AVX512VL (which uses predicate registers). - if (Subtarget.hasVLX()) - return SDValue(); - + // FIXME: Better handling of k-registers or 512-bit vectors? unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && Subtarget.hasAVX()))) @@ -36976,21 +38887,14 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is any integer type above i16. - EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) + EVT ExtractVT = Extract->getValueType(0); + // Verify the type we're extracting is either i32 or i64. + // FIXME: Could support other types, but this is what we have coverage for. + if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64) return SDValue(); - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. 
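The new all_of(setcc(vec,0,eq)) rewrite above doubles the element count and compares with PCMPEQD when PCMPEQQ is unavailable; this is sound because a 64-bit lane is zero exactly when both of its 32-bit halves are zero. A small sketch of that identity (illustrative only):

    #include <cstdint>

    // A 64-bit lane compares equal to zero iff both 32-bit sub-lanes do, so an
    // all_of reduction over vXi64 == 0 can be done as all_of over v2Xi32 == 0.
    bool isZero64(uint64_t X) {
      uint32_t Lo = uint32_t(X);
      uint32_t Hi = uint32_t(X >> 32);
      return (Lo == 0) && (Hi == 0);  // same answer as (X == 0)
    }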
- if (RegSize / VT.getVectorNumElements() < 8) + EVT VT = Extract->getOperand(0).getValueType(); + if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Match shuffle + add pyramid. @@ -37006,8 +38910,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an @@ -37027,7 +38931,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); - MVT SadVT = SAD.getSimpleValueType(); + EVT SadVT = SAD.getValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); @@ -37042,12 +38946,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, } } - MVT Type = Extract->getSimpleValueType(0); - unsigned TypeSizeInBits = Type.getSizeInBits(); - // Return the lowest TypeSizeInBits bits. - MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); + unsigned ExtractSizeInBits = ExtractVT.getSizeInBits(); + // Return the lowest ExtractSizeInBits bits. + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, + SadVT.getSizeInBits() / ExtractSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD, Extract->getOperand(1)); } @@ -37066,19 +38970,34 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); + unsigned SrcEltBits = SrcSVT.getSizeInBits(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) return SDValue(); + const APInt &IdxC = N->getConstantOperandAPInt(1); + if (IdxC.uge(NumSrcElts)) + return SDValue(); + SDValue SrcBC = peekThroughBitcasts(Src); - // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. + // Handle extract(bitcast(broadcast(scalar_value))). if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { SDValue SrcOp = SrcBC.getOperand(0); - if (SrcOp.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, SrcOp); + EVT SrcOpVT = SrcOp.getValueType(); + if (SrcOpVT.isScalarInteger() && VT.isInteger() && + (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) { + unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits; + unsigned Offset = IdxC.urem(Scale) * SrcEltBits; + // TODO support non-zero offsets. 
+ if (Offset == 0) { + SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType()); + SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT); + return SrcOp; + } + } } // If we're extracting a single element from a broadcast load and there are @@ -37087,22 +39006,43 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC); unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && - VT.getSizeInBits() == SrcBCWidth) { + VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) { SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), MemIntr->getBasePtr(), MemIntr->getPointerInfo(), - MemIntr->getAlignment(), + MemIntr->getOriginalAlign(), MemIntr->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); return Load; } } + // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers. + // TODO: Move to DAGCombine? + if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() && + SrcBC.getValueType().isInteger() && + (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 && + SrcBC.getScalarValueSizeInBits() == + SrcBC.getOperand(0).getValueSizeInBits()) { + unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits; + if (IdxC.ult(Scale)) { + unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits(); + SDValue Scl = SrcBC.getOperand(0); + EVT SclVT = Scl.getValueType(); + if (Offset) { + Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl, + DAG.getShiftAmountConstant(Offset, SclVT, dl)); + } + Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType()); + Scl = DAG.getZExtOrTrunc(Scl, dl, VT); + return Scl; + } + } + // Handle extract(truncate(x)) for 0'th index. // TODO: Treat this as a faux shuffle? // TODO: When can we use this for general indices? - if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && - isNullConstant(Idx)) { + if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) { Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); Src = DAG.getBitcast(SrcVT, Src); return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx); @@ -37114,12 +39054,18 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) return SDValue(); + // Shuffle inputs must be the same size as the result. + if (llvm::any_of(Ops, [SrcVT](SDValue Op) { + return SrcVT.getSizeInBits() != Op.getValueSizeInBits(); + })) + return SDValue(); + // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector<int, 16> ScaledMask; int Scale = NumSrcElts / Mask.size(); - scaleShuffleMask<int>(Scale, Mask, ScaledMask); + narrowShuffleMaskElts(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { // Simplify Mask based on demanded element. @@ -37144,7 +39090,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if (Mask.size() != NumSrcElts) return SDValue(); - int SrcIdx = Mask[N->getConstantOperandVal(1)]; + int SrcIdx = Mask[IdxC.getZExtValue()]; // If the shuffle source element is undef/zero then we can just accept it. 
if (SrcIdx == SM_SentinelUndef) @@ -37171,8 +39117,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { - assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && - "Unexpected extraction type"); + assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SrcOp = DAG.getBitcast(SrcVT, SrcOp); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, @@ -37342,12 +39287,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, // vXi8 reduction - sum lo/hi halves then use PSADBW. if (VT == MVT::i8) { while (Rdx.getValueSizeInBits() > 128) { - unsigned HalfSize = VecVT.getSizeInBits() / 2; - unsigned HalfElts = VecVT.getVectorNumElements() / 2; - SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize); - SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize); - Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi); - VecVT = Rdx.getValueType(); + SDValue Lo, Hi; + std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); + VecVT = Lo.getValueType(); + Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi); } assert(VecVT == MVT::v16i8 && "v16i8 reduction expected"); @@ -37362,8 +39305,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG, } // Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize. - bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize(); - if (!Subtarget.hasFastHorizontalOps() && !OptForSize) + if (!shouldUseHorizontalOp(true, DAG, Subtarget)) return SDValue(); unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; @@ -37495,11 +39437,21 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, // Attempt to extract a i1 element by using MOVMSK to extract the signbits // and then testing the relevant element. + // + // Note that we only combine extracts on the *same* result number, i.e. + // t0 = merge_values a0, a1, a2, a3 + // i1 = extract_vector_elt t0, Constant:i64<2> + // i1 = extract_vector_elt t0, Constant:i64<3> + // but not + // i1 = extract_vector_elt t0:1, Constant:i64<2> + // since the latter would need its own MOVMSK. if (CIdx && SrcVT.getScalarType() == MVT::i1) { SmallVector<SDNode *, 16> BoolExtracts; - auto IsBoolExtract = [&BoolExtracts](SDNode *Use) { + unsigned ResNo = InputVector.getResNo(); + auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) { if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa<ConstantSDNode>(Use->getOperand(1)) && + Use->getOperand(0).getResNo() == ResNo && Use->getValueType(0) == MVT::i1) { BoolExtracts.push_back(Use); return true; @@ -37548,8 +39500,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, assert(CondVT.isVector() && "Vector select expects a vector selector!"); - // Check if the first operand is all zeros and Cond type is vXi1. - // This situation only applies to avx512. // TODO: Use isNullOrNullSplat() to distinguish constants with undefs? // TODO: Can we assert that both operands are not zeros (because that should // get simplified at node creation time)? 
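For the vXi8 reduction path above, PSADBW against an all-zero vector returns one partial sum per 8-byte group, since |b - 0| = b for unsigned bytes; adding the partial sums gives the full reduction. A scalar sketch of why that works (not part of the patch):

    #include <cstdint>

    // psadbw(X, 0) produces one partial sum per 8-byte group (each at most
    // 8 * 255, so it fits comfortably); adding the two group sums of a 16-byte
    // vector yields the same value as summing all 16 bytes directly.
    uint32_t byteSumViaSAD(const uint8_t Bytes[16]) {
      uint32_t Group0 = 0, Group1 = 0;
      for (int I = 0; I < 8; ++I)
        Group0 += Bytes[I];        // |Bytes[I] - 0| == Bytes[I]
      for (int I = 8; I < 16; ++I)
        Group1 += Bytes[I];
      return Group0 + Group1;      // full byte-sum reduction
    }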
@@ -37564,14 +39514,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, return DAG.getConstant(0, DL, VT); } - if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() && - Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { - // Invert the cond to not(cond) : xor(op,allones)=not(op) - SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); - // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 - return DAG.getSelect(DL, VT, CondNew, RHS, LHS); - } - // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must // have already been promoted from the IR select condition type <N x i1>. @@ -37796,12 +39738,13 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, return true; }; + APInt DemandedBits(APInt::getSignMask(BitWidth)); + if (OnlyUsedAsSelectCond(Cond)) { - APInt DemandedMask(APInt::getSignMask(BitWidth)); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); - if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) + if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true)) return SDValue(); // If we changed the computation somewhere in the DAG, this change will @@ -37823,15 +39766,9 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, } // Otherwise we can still at least try to simplify multiple use bits. - APInt DemandedMask(APInt::getSignMask(BitWidth)); - APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements())); - KnownBits Known; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask, - DemandedElts, DAG, 0)) - return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), - V, N->getOperand(1), N->getOperand(2)); + if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG)) + return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V, + N->getOperand(1), N->getOperand(2)); return SDValue(); } @@ -38315,6 +40252,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, } } + // Check if the first operand is all zeros and Cond type is vXi1. + // If this an avx512 target we can improve the use of zero masking by + // swapping the operands and inverting the condition. + if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() && + Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && + ISD::isBuildVectorAllZeros(LHS.getNode()) && + !ISD::isBuildVectorAllZeros(RHS.getNode())) { + // Invert the cond to not(cond) : xor(op,allones)=not(op) + SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); + // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 + return DAG.getSelect(DL, VT, CondNew, RHS, LHS); + } + // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); @@ -38334,12 +40284,86 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(CondVT, CondNot), RHS, LHS); - // Custom action for SELECT MMX - if (VT == MVT::x86mmx) { - LHS = DAG.getBitcast(MVT::i64, LHS); - RHS = DAG.getBitcast(MVT::i64, RHS); - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS); - return DAG.getBitcast(VT, newSelect); + // Try to optimize vXi1 selects if both operands are either all constants or + // bitcasts from scalar integer type. 
In that case we can convert the operands + // to integer and use an integer select which will be converted to a CMOV. + // We need to take a little bit of care to avoid creating an i64 type after + // type legalization. + if (N->getOpcode() == ISD::SELECT && VT.isVector() && + VT.getVectorElementType() == MVT::i1 && + (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) { + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); + bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()); + bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()); + + if ((LHSIsConst || + (LHS.getOpcode() == ISD::BITCAST && + LHS.getOperand(0).getValueType() == IntVT)) && + (RHSIsConst || + (RHS.getOpcode() == ISD::BITCAST && + RHS.getOperand(0).getValueType() == IntVT))) { + if (LHSIsConst) + LHS = combinevXi1ConstantToInteger(LHS, DAG); + else + LHS = LHS.getOperand(0); + + if (RHSIsConst) + RHS = combinevXi1ConstantToInteger(RHS, DAG); + else + RHS = RHS.getOperand(0); + + SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS); + return DAG.getBitcast(VT, Select); + } + } + + // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of + // single bits, then invert the predicate and swap the select operands. + // This can lower using a vector shift bit-hack rather than mask and compare. + if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() && + N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 && + Cond.getOperand(0).getOpcode() == ISD::AND && + isNullOrNullSplat(Cond.getOperand(1)) && + cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && + Cond.getOperand(0).getValueType() == VT) { + // The 'and' mask must be composed of power-of-2 constants. + SDValue And = Cond.getOperand(0); + auto *C = isConstOrConstSplat(And.getOperand(1)); + if (C && C->getAPIntValue().isPowerOf2()) { + // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS + SDValue NotCond = + DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE); + return DAG.getSelect(DL, VT, NotCond, RHS, LHS); + } + + // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld + // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply. + // 16-bit lacks a proper blendv. + unsigned EltBitWidth = VT.getScalarSizeInBits(); + bool CanShiftBlend = + TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) || + (Subtarget.hasAVX2() && EltBitWidth == 64) || + (Subtarget.hasXOP())); + if (CanShiftBlend && + ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) { + return C->getAPIntValue().isPowerOf2(); + })) { + // Create a left-shift constant to get the mask bits over to the sign-bit. 
+ SDValue Mask = And.getOperand(1); + SmallVector<int, 32> ShlVals; + for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { + auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i)); + ShlVals.push_back(EltBitWidth - 1 - + MaskVal->getAPIntValue().exactLogBase2()); + } + // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS + SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL); + SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt); + SDValue NewCond = + DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT); + return DAG.getSelect(DL, VT, NewCond, RHS, LHS); + } } return SDValue(); @@ -38665,6 +40689,282 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { return SDValue(); } +/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC +/// to avoid the inversion. +static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST. + if (EFLAGS.getOpcode() != X86ISD::PTEST && + EFLAGS.getOpcode() != X86ISD::TESTP) + return SDValue(); + + // PTEST/TESTP sets EFLAGS as: + // TESTZ: ZF = (Op0 & Op1) == 0 + // TESTC: CF = (~Op0 & Op1) == 0 + // TESTNZC: ZF == 0 && CF == 0 + EVT VT = EFLAGS.getValueType(); + SDValue Op0 = EFLAGS.getOperand(0); + SDValue Op1 = EFLAGS.getOperand(1); + EVT OpVT = Op0.getValueType(); + + // TEST*(~X,Y) == TEST*(X,Y) + if (SDValue NotOp0 = IsNOT(Op0, DAG)) { + X86::CondCode InvCC; + switch (CC) { + case X86::COND_B: + // testc -> testz. + InvCC = X86::COND_E; + break; + case X86::COND_AE: + // !testc -> !testz. + InvCC = X86::COND_NE; + break; + case X86::COND_E: + // testz -> testc. + InvCC = X86::COND_B; + break; + case X86::COND_NE: + // !testz -> !testc. + InvCC = X86::COND_AE; + break; + case X86::COND_A: + case X86::COND_BE: + // testnzc -> testnzc (no change). + InvCC = CC; + break; + default: + InvCC = X86::COND_INVALID; + break; + } + + if (InvCC != X86::COND_INVALID) { + CC = InvCC; + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(OpVT, NotOp0), Op1); + } + } + + if (CC == X86::COND_E || CC == X86::COND_NE) { + // TESTZ(X,~Y) == TESTC(Y,X) + if (SDValue NotOp1 = IsNOT(Op1, DAG)) { + CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(OpVT, NotOp1), Op0); + } + + if (Op0 == Op1) { + SDValue BC = peekThroughBitcasts(Op0); + EVT BCVT = BC.getValueType(); + assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) && + "Unexpected vector type"); + + // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y) + if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) { + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(OpVT, BC.getOperand(0)), + DAG.getBitcast(OpVT, BC.getOperand(1))); + } + + // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y) + if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) { + CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, + DAG.getBitcast(OpVT, BC.getOperand(0)), + DAG.getBitcast(OpVT, BC.getOperand(1))); + } + + // If every element is an all-sign value, see if we can use MOVMSK to + // more efficiently extract the sign bits and compare that. + // TODO: Handle TESTC with comparison inversion. 
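The vselect combine above moves a power-of-two mask bit into the sign-bit position via a left shift (the ShlVals computation), so the test can be lowered as a sign-bit blend. A scalar sketch of the underlying identity for one 32-bit lane, assuming K < 32 (helper name illustrative):

    #include <cstdint>

    // For C = 1u << K in a 32-bit lane, (X & C) != 0 is equivalent to testing
    // the sign bit of X << (31 - K), which is what the shift-then-blend uses.
    bool bitSetViaSignBit(uint32_t X, unsigned K) {
      return ((X << (31 - K)) & 0x80000000u) != 0;  // same as (X & (1u << K)) != 0
    }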
+ // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on + // MOVMSK combines to make sure its never worse than PTEST? + unsigned EltBits = BCVT.getScalarSizeInBits(); + if (DAG.ComputeNumSignBits(BC) == EltBits) { + assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result"); + APInt SignMask = APInt::getSignMask(EltBits); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (SDValue Res = + TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) { + // For vXi16 cases we need to use pmovmksb and extract every other + // sign bit. + SDLoc DL(EFLAGS); + if (EltBits == 16) { + MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8; + Res = DAG.getBitcast(MovmskVT, Res); + Res = getPMOVMSKB(DL, Res, DAG, Subtarget); + Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res, + DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); + } else { + Res = getPMOVMSKB(DL, Res, DAG, Subtarget); + } + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res, + DAG.getConstant(0, DL, MVT::i32)); + } + } + } + + // TESTZ(-1,X) == TESTZ(X,X) + if (ISD::isBuildVectorAllOnes(Op0.getNode())) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1); + + // TESTZ(X,-1) == TESTZ(X,X) + if (ISD::isBuildVectorAllOnes(Op1.getNode())) + return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); + } + + return SDValue(); +} + +// Attempt to simplify the MOVMSK input based on the comparison type. +static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Handle eq/ne against zero (any_of). + // Handle eq/ne against -1 (all_of). + if (!(CC == X86::COND_E || CC == X86::COND_NE)) + return SDValue(); + if (EFLAGS.getValueType() != MVT::i32) + return SDValue(); + unsigned CmpOpcode = EFLAGS.getOpcode(); + if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB) + return SDValue(); + auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1)); + if (!CmpConstant) + return SDValue(); + const APInt &CmpVal = CmpConstant->getAPIntValue(); + + SDValue CmpOp = EFLAGS.getOperand(0); + unsigned CmpBits = CmpOp.getValueSizeInBits(); + assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch"); + + // Peek through any truncate. + if (CmpOp.getOpcode() == ISD::TRUNCATE) + CmpOp = CmpOp.getOperand(0); + + // Bail if we don't find a MOVMSK. + if (CmpOp.getOpcode() != X86ISD::MOVMSK) + return SDValue(); + + SDValue Vec = CmpOp.getOperand(0); + MVT VecVT = Vec.getSimpleValueType(); + assert((VecVT.is128BitVector() || VecVT.is256BitVector()) && + "Unexpected MOVMSK operand"); + unsigned NumElts = VecVT.getVectorNumElements(); + unsigned NumEltBits = VecVT.getScalarSizeInBits(); + + bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue(); + bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits && + CmpVal.isMask(NumElts); + if (!IsAnyOf && !IsAllOf) + return SDValue(); + + // See if we can peek through to a vector with a wider element type, if the + // signbits extend down to all the sub-elements as well. + // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose + // potential SimplifyDemandedBits/Elts cases. 
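combineSetCCMOVMSK above matches EFLAGS compares of a MOVMSK result against 0 (any_of) or an all-ones mask (all_of). A scalar model of what MOVMSK computes for four 32-bit lanes and how the two predicates read out of it (a sketch, not the lowering itself):

    #include <cstdint>

    // MOVMSK packs the sign bit of each element into the low bits of a scalar.
    unsigned movmskPS(const int32_t V[4]) {
      unsigned Mask = 0;
      for (int I = 0; I < 4; ++I)
        Mask |= (uint32_t(V[I]) >> 31) << I;
      return Mask;
    }

    // With compare results (each lane all-ones or all-zeros):
    bool anyLaneSet(const int32_t V[4]) { return movmskPS(V) != 0; }    // any_of
    bool allLanesSet(const int32_t V[4]) { return movmskPS(V) == 0xF; } // all_of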
+ if (Vec.getOpcode() == ISD::BITCAST) { + SDValue BC = peekThroughBitcasts(Vec); + MVT BCVT = BC.getSimpleValueType(); + unsigned BCNumElts = BCVT.getVectorNumElements(); + unsigned BCNumEltBits = BCVT.getScalarSizeInBits(); + if ((BCNumEltBits == 32 || BCNumEltBits == 64) && + BCNumEltBits > NumEltBits && + DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) { + SDLoc DL(EFLAGS); + unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, + DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC), + DAG.getConstant(CmpMask, DL, MVT::i32)); + } + } + + // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X). + // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X). + if (IsAllOf && Subtarget.hasSSE41()) { + SDValue BC = peekThroughBitcasts(Vec); + if (BC.getOpcode() == X86ISD::PCMPEQ && + ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) { + MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0)); + return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); + } + } + + // See if we can avoid a PACKSS by calling MOVMSK on the sources. + // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out + // sign bits prior to the comparison with zero unless we know that + // the vXi16 splats the sign bit down to the lower i8 half. + // TODO: Handle all_of patterns. + if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) { + SDValue VecOp0 = Vec.getOperand(0); + SDValue VecOp1 = Vec.getOperand(1); + bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8; + bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8; + // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA. + if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16); + if (!SignExt0) { + Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result, + DAG.getConstant(0xAAAA, DL, MVT::i16)); + } + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + DAG.getConstant(0, DL, MVT::i16)); + } + // PMOVMSKB(PACKSSBW(LO(X), HI(X))) + // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA. + if (CmpBits == 16 && Subtarget.hasInt256() && + VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + VecOp0.getOperand(0) == VecOp1.getOperand(0) && + VecOp0.getConstantOperandAPInt(1) == 0 && + VecOp1.getConstantOperandAPInt(1) == 8 && + (IsAnyOf || (SignExt0 && SignExt1))) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0)); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF; + if (!SignExt0 || !SignExt1) { + assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns"); + Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, + DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); + } + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + DAG.getConstant(CmpMask, DL, MVT::i32)); + } + } + + // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced. 
+ SmallVector<int, 32> ShuffleMask; + SmallVector<SDValue, 2> ShuffleInputs; + if (NumElts == CmpBits && + getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs, + ShuffleMask, DAG) && + ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) && + ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) { + unsigned NumShuffleElts = ShuffleMask.size(); + APInt DemandedElts = APInt::getNullValue(NumShuffleElts); + for (int M : ShuffleMask) { + assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index"); + DemandedElts.setBit(M); + } + if (DemandedElts.isAllOnesValue()) { + SDLoc DL(EFLAGS); + SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]); + Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); + Result = + DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType()); + return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, + EFLAGS.getOperand(1)); + } + } + + return SDValue(); +} + /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. @@ -38677,6 +40977,13 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; + + if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget)) + return R; + + if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget)) + return R; + return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } @@ -38698,7 +41005,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { - if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { + if (!(FalseOp.getValueType() == MVT::f80 || + (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) || + (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) || + !Subtarget.hasCMov() || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); @@ -39007,7 +41317,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, : ISD::SIGN_EXTEND, DL, VT, MulLo); - MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = @@ -39138,10 +41448,14 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); - // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. - // Also allow v2i32 if it will be widened. + // Make sure the type is legal or will be widened to a legal type. + if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); - if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT)) + + // Without BWI, we would need to split v32i16. 
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI()) return SDValue(); SDValue N0 = N->getOperand(0); @@ -39358,6 +41672,64 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, return NewMul; } +// Try to form a MULHU or MULHS node by looking for +// (srl (mul ext, ext), 16) +// TODO: This is X86 specific because we want to be able to handle wide types +// before type legalization. But we can only do it if the vector will be +// legalized via widening/splitting. Type legalization can't handle promotion +// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG +// combiner. +static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + "SRL or SRA node is required here!"); + SDLoc DL(N); + + // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand + // the multiply. + if (!Subtarget.hasSSE41()) + return SDValue(); + + // The operation feeding into the shift must be a multiply. + SDValue ShiftOperand = N->getOperand(0); + if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse()) + return SDValue(); + + // Input type should be at least vXi32. + EVT VT = N->getValueType(0); + if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32) + return SDValue(); + + // Need a shift by 16. + APInt ShiftAmt; + if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) || + ShiftAmt != 16) + return SDValue(); + + SDValue LHS = ShiftOperand.getOperand(0); + SDValue RHS = ShiftOperand.getOperand(1); + + unsigned ExtOpc = LHS.getOpcode(); + if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || + RHS.getOpcode() != ExtOpc) + return SDValue(); + + // Peek through the extends. + LHS = LHS.getOperand(0); + RHS = RHS.getOperand(0); + + // Ensure the input types match. + EVT MulVT = LHS.getValueType(); + if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT) + return SDValue(); + + unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; + SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS); + + ExtOpc = N->getOpcode() == ISD::SRA ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(ExtOpc, DL, VT, Mulh); +} + static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -39417,12 +41789,16 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); + if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) + return V; + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) @@ -39471,11 +41847,15 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { } static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); + if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) + return V; + // Only do this on the last DAG combine as it can interfere with other // combines. if (!DCI.isAfterLegalizeDAG()) @@ -39519,16 +41899,92 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) { + unsigned Opcode = N->getOpcode(); + assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && + "Unexpected pack opcode"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + unsigned NumDstElts = VT.getVectorNumElements(); + + // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) + // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for + // truncation trees that help us avoid lane crossing shuffles. + // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. + if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && + N0.getConstantOperandAPInt(1) == 0 && + N1.getConstantOperandAPInt(1) == (NumDstElts / 2) && + N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() && + N0.getOperand(0).getValueType().is256BitVector()) { + // TODO - support target/faux shuffles. + SDValue Vec = peekThroughBitcasts(N0.getOperand(0)); + if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) { + // To keep the PACK LHS/RHS coherency, we must be able to scale the unary + // shuffle to a vXi64 width - we can probably relax this in the future. + SmallVector<int, 4> ShuffleMask; + if (SVN->getOperand(1).isUndef() && + scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) { + SDLoc DL(N); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL); + Lo = DAG.getBitcast(N0.getValueType(), Lo); + Hi = DAG.getBitcast(N1.getValueType(), Hi); + SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); + Res = DAG.getBitcast(MVT::v4i32, Res); + Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } + } + } + + // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)). + // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles. 
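combineShiftToPMULH above turns (srl/sra (mul (ext X), (ext Y)), 16) into MULHU/MULHS; for 16-bit inputs the high half of the widened product is exactly what those nodes compute. A scalar sketch of that identity (the signed case relies on arithmetic >>):

    #include <cstdint>

    // High half of a widened 16x16 multiply == MULHU / MULHS of the narrow values.
    uint16_t mulhu16(uint16_t A, uint16_t B) {
      return uint16_t((uint32_t(A) * uint32_t(B)) >> 16);
    }
    int16_t mulhs16(int16_t A, int16_t B) {
      return int16_t((int32_t(A) * int32_t(B)) >> 16);  // arithmetic shift
    }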
+ if (VT.is256BitVector()) { + if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) { + if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) { + SmallVector<int, 2> ShuffleMask0, ShuffleMask1; + if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) && + scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) { + SDValue Op00 = SVN0->getOperand(0); + SDValue Op01 = SVN0->getOperand(1); + SDValue Op10 = SVN1->getOperand(0); + SDValue Op11 = SVN1->getOperand(1); + if ((Op00 == Op11) && (Op01 == Op10)) { + std::swap(Op10, Op11); + ShuffleVectorSDNode::commuteMask(ShuffleMask1); + } + if ((Op00 == Op10) && (Op01 == Op11)) { + SmallVector<int, 4> ShuffleMask; + ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); + ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); + SDLoc DL(N); + SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); + Res = DAG.getBitcast(MVT::v4i64, Res); + Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); + } + } + } + } + } + + return SDValue(); +} + static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && - "Unexpected shift opcode"); + "Unexpected pack opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); + unsigned NumDstElts = VT.getVectorNumElements(); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && @@ -39545,7 +42001,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; - unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; @@ -39592,6 +42047,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } + // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()). + if (SDValue V = combineVectorPackWithShuffle(N, DAG)) + return V; + // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. if (Subtarget.hasAVX512() && @@ -39674,26 +42133,37 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); - else - ShiftVal = NumBitsPerElt - 1; + ShiftVal = NumBitsPerElt - 1; } - // Shift N0 by zero -> N0. + // (shift X, 0) -> X if (!ShiftVal) return N0; - // Shift zero -> zero. + // (shift 0, C) -> 0 if (ISD::isBuildVectorAllZeros(N0.getNode())) + // N0 is all zeros or undef. We guarantee that the bits shifted into the + // result are all zeros, not undef. return DAG.getConstant(0, SDLoc(N), VT); - // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2) - // clamped to (NumBitsPerElt - 1). - if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) { + // (VSRAI -1, C) -> -1 + if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode())) + // N0 is all ones or undef. We guarantee that the bits shifted into the + // result are all ones, not undef. 
+ return DAG.getConstant(-1, SDLoc(N), VT); + + // (shift (shift X, C2), C1) -> (shift X, (C1 + C2)) + if (Opcode == N0.getOpcode()) { unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue(); unsigned NewShiftVal = ShiftVal + ShiftVal2; - if (NewShiftVal >= NumBitsPerElt) + if (NewShiftVal >= NumBitsPerElt) { + // Out of range logical bit shifts are guaranteed to be zero. + // Out of range arithmetic bit shifts splat the sign bit. + if (LogicalShift) + return DAG.getConstant(0, SDLoc(N), VT); NewShiftVal = NumBitsPerElt - 1; - return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), + } + return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0), DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); } @@ -39743,19 +42213,24 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) || - (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) && + (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) || + N->getOpcode() == ISD::INSERT_VECTOR_ELT) && "Unexpected vector insertion"); - unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(SDValue(N, 0), - APInt::getAllOnesValue(NumBitsPerElt), DCI)) - return SDValue(N, 0); + if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) { + unsigned NumBitsPerElt = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), + APInt::getAllOnesValue(NumBitsPerElt), DCI)) + return SDValue(N, 0); + } - // Attempt to combine PINSRB/PINSRW patterns to a shuffle. - SDValue Op(N, 0); - if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) - return Res; + // Attempt to combine insertion patterns to a shuffle. + if (VT.isSimple() && DCI.isAfterLegalizeDAG()) { + SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } return SDValue(); } @@ -39778,7 +42253,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); // The SETCCs should both refer to the same CMP. - if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) + if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); @@ -39877,10 +42352,27 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (SDValue Not = IsNOT(N0, DAG)) { + auto GetNot = [&VT, &DAG](SDValue V) { + // Basic X = NOT(Y) detection. + if (SDValue Not = IsNOT(V, DAG)) + return Not; + // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y). 
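The generalized two-shift fold above merges (shift (shift X, C2), C1) into one shift of C1 + C2, clamping out-of-range logical shifts to zero and out-of-range arithmetic shifts to a sign splat; the preceding fold likewise keeps an arithmetic shift of all-ones at all-ones. A rough scalar model of those clamping rules (plain C++, assuming the usual arithmetic behaviour of >> on negative values):

#include <cassert>
#include <cstdint>

// One 32-bit lane with the X86 vector-shift clamping used by the combine.
static uint32_t srl_merge(uint32_t x, unsigned c1, unsigned c2) {
  unsigned total = c1 + c2;
  if (total >= 32) return 0;            // out-of-range logical shift -> 0
  return x >> total;
}
static int32_t sra_merge(int32_t x, unsigned c1, unsigned c2) {
  unsigned total = c1 + c2;
  if (total >= 32) total = 31;          // out-of-range ashr splats the sign bit
  return x >> total;                    // arithmetic shift assumed
}

int main() {
  assert(srl_merge(0xDEADBEEFu, 20, 20) == 0);
  assert(sra_merge(-1, 20, 20) == -1);               // ashr of all-ones stays all-ones
  assert(sra_merge(0x40000000, 3, 4) == (0x40000000 >> 7));
  return 0;
}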
+ if (V.getOpcode() == X86ISD::VBROADCAST) { + SDValue Src = V.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isVector()) + return SDValue(); + if (SDValue Not = IsNOT(Src, DAG)) + return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT, + DAG.getBitcast(SrcVT, Not)); + } + return SDValue(); + }; + + if (SDValue Not = GetNot(N0)) { X = Not; Y = N1; - } else if (SDValue Not = IsNOT(N1, DAG)) { + } else if (SDValue Not = GetNot(N1)) { X = Not; Y = N0; } else @@ -39891,6 +42383,65 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); } +// Try to widen AND, OR and XOR nodes to VT in order to remove casts around +// logical operations, like in the example below. +// or (and (truncate x, truncate y)), +// (xor (truncate z, build_vector (constants))) +// Given a target type \p VT, we generate +// or (and x, y), (xor z, zext(build_vector (constants))) +// given x, y and z are of type \p VT. We can do so, if operands are either +// truncates from VT types, the second operand is a vector of constants or can +// be recursively promoted. +static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, + unsigned Depth) { + // Limit recursion to avoid excessive compile times. + if (Depth >= SelectionDAG::MaxRecursionDepth) + return SDValue(); + + if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND && + N->getOpcode() != ISD::OR) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT)) + return SDValue(); + + if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1)) + N0 = NN0; + else { + // The Left side has to be a trunc. + if (N0.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + // The type of the truncated inputs. + if (N0.getOperand(0).getValueType() != VT) + return SDValue(); + + N0 = N0.getOperand(0); + } + + if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1)) + N1 = NN1; + else { + // The right side has to be a 'trunc' or a constant vector. + bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && + N1.getOperand(0).getValueType() == VT; + if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) + return SDValue(); + + if (RHSTrunc) + N1 = N1.getOperand(0); + else + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); + } + + return DAG.getNode(N->getOpcode(), DL, VT, N0, N1); +} + // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes @@ -39902,6 +42453,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); assert(VT.isVector() && "Expected vector type"); + SDLoc DL(N); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); @@ -39909,57 +42461,33 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow.getValueType(); - if (Narrow->getOpcode() != ISD::XOR && - Narrow->getOpcode() != ISD::AND && - Narrow->getOpcode() != ISD::OR) - return SDValue(); - - SDValue N0 = Narrow->getOperand(0); - SDValue N1 = Narrow->getOperand(1); - SDLoc DL(Narrow); - - // The Left side has to be a trunc. 
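The recursive PromoteMaskArithmetic above depends on bitwise AND/OR/XOR commuting with truncation, so a whole tree of logic ops over truncated masks can be evaluated at the wide type and narrowed once (constants are simply zero-extended). A minimal scalar check of that property, separate from the patch itself:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0x12345678u, y = 0xA5A5A5A5u, z = 0x0F0F1234u;

  // Narrow first, then combine...
  uint16_t narrow = (uint16_t(x) & uint16_t(y)) | (uint16_t(z) ^ 0x00FFu);
  // ...equals combining at the wide width and truncating once at the end
  // (the constant 0x00FF plays the role of the zero-extended build_vector).
  uint16_t wide   = uint16_t((x & y) | (z ^ 0x00FFu));

  assert(narrow == wide);
  return 0;
}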
- if (N0.getOpcode() != ISD::TRUNCATE) - return SDValue(); - - // The type of the truncated inputs. - if (N0.getOperand(0).getValueType() != VT) - return SDValue(); - - // The right side has to be a 'trunc' or a constant vector. - bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && - N1.getOperand(0).getValueType() == VT; - if (!RHSTrunc && - !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) - return SDValue(); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) - return SDValue(); - - // Set N0 and N1 to hold the inputs to the new wide operation. - N0 = N0.getOperand(0); - if (RHSTrunc) - N1 = N1.getOperand(0); - else - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); - // Generate the wide operation. - SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); - unsigned Opcode = N->getOpcode(); - switch (Opcode) { + SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0); + if (!Op) + return SDValue(); + switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: - return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); + return DAG.getZeroExtendInReg(Op, DL, NarrowVT); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); } } +static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) { + unsigned FPOpcode; + switch (Opcode) { + default: llvm_unreachable("Unexpected input node for FP logic conversion"); + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; + } + return FPOpcode; +} + /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. @@ -39984,18 +42512,45 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, (Subtarget.hasSSE2() && N00Type == MVT::f64))) return SDValue(); - unsigned FPOpcode; - switch (N->getOpcode()) { - default: llvm_unreachable("Unexpected input node for FP logic conversion"); - case ISD::AND: FPOpcode = X86ISD::FAND; break; - case ISD::OR: FPOpcode = X86ISD::FOR; break; - case ISD::XOR: FPOpcode = X86ISD::FXOR; break; - } - + unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); return DAG.getBitcast(VT, FPLogic); } +// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) +// to reduce XMM->GPR traffic. +static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) { + unsigned Opc = N->getOpcode(); + assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) && + "Unexpected bit opcode"); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Both operands must be single use MOVMSK. + if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() || + N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse()) + return SDValue(); + + SDValue Vec0 = N0.getOperand(0); + SDValue Vec1 = N1.getOperand(0); + EVT VecVT0 = Vec0.getValueType(); + EVT VecVT1 = Vec1.getValueType(); + + // Both MOVMSK operands must be from vectors of the same size and same element + // size, but its OK for a fp/int diff. 
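The BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) fold being added here works because collecting per-lane sign bits commutes with lane-wise bitwise operations. A scalar sketch with a hand-rolled four-lane movemask (illustrative helpers, not LLVM APIs):

#include <cassert>
#include <cstdint>

// Gather the sign bit of each 32-bit lane, the way (V)MOVMSKPS does.
static unsigned movemask4(const uint32_t v[4]) {
  unsigned m = 0;
  for (int i = 0; i < 4; ++i)
    m |= (v[i] >> 31) << i;
  return m;
}

int main() {
  uint32_t a[4] = {0x80000000u, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x00000001u};
  uint32_t b[4] = {0xC0000000u, 0x80000000u, 0x00000000u, 0x90000000u};

  uint32_t anded[4];
  for (int i = 0; i < 4; ++i)
    anded[i] = a[i] & b[i];

  // AND of the two masks equals the mask of the lane-wise AND.
  assert((movemask4(a) & movemask4(b)) == movemask4(anded));
  return 0;
}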
+ if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() || + VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits()) + return SDValue(); + + SDLoc DL(N); + unsigned VecOpc = + VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc; + SDValue Result = + DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1)); + return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); +} + /// If this is a zero/all-bits result that is bitwise-anded with a low bits /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' /// with a shift-right to eliminate loading the vector constant mask value. @@ -40318,7 +42873,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, // TODO: Support multiple SrcOps. if (VT == MVT::i1) { SmallVector<SDValue, 2> SrcOps; - if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) && + SmallVector<APInt, 2> SrcPartials; + if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) && SrcOps.size() == 1) { SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -40328,9 +42884,11 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) Mask = DAG.getBitcast(MaskVT, SrcOps[0]); if (Mask) { - APInt AllBits = APInt::getAllOnesValue(NumElts); - return DAG.getSetCC(dl, MVT::i1, Mask, - DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ); + assert(SrcPartials[0].getBitWidth() == NumElts && + "Unexpected partial reduction mask"); + SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT); + Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits); + return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ); } } } @@ -40338,6 +42896,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget)) return V; + if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) + return R; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -40446,6 +43007,16 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, } SDLoc DL(N); + + if (UseVPTERNLOG) { + // Emit a VPTERNLOG node directly. + SDValue A = DAG.getBitcast(VT, N0.getOperand(1)); + SDValue B = DAG.getBitcast(VT, N0.getOperand(0)); + SDValue C = DAG.getBitcast(VT, N1.getOperand(0)); + SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8); + return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm); + } + SDValue X = N->getOperand(0); SDValue Y = DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), @@ -40529,6 +43100,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (!Subtarget.hasSSE41()) return SDValue(); + // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops. + if (Subtarget.hasVLX()) + return SDValue(); + MVT BlendVT = VT.is256BitVector() ? 
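canonicalizeBitSelect above now emits VPTERNLOG directly with immediate 0xCA, which is the ternary truth table for a bitwise select (A & B) | (~A & C). A scalar model of how such an immediate can be evaluated per bit (a sketch; the real instruction operates on full vector lanes, and the exact operand-to-selector mapping in the patch is abstracted away here):

#include <cassert>
#include <cstdint>

// Evaluate a VPTERNLOG-style immediate bit by bit: the three source bits at
// each position form a 3-bit index into the 8-bit truth table.
static uint32_t ternlog(uint32_t a, uint32_t b, uint32_t c, uint8_t imm) {
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i) {
    unsigned idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
    r |= uint32_t((imm >> idx) & 1) << i;
  }
  return r;
}

int main() {
  uint32_t mask = 0xFF00FF00u, x = 0x12345678u, y = 0xABCDEF01u;
  // Immediate 0xCA encodes "mask ? x : y", i.e. (mask & x) | (~mask & y).
  assert(ternlog(mask, x, y, 0xCA) == ((mask & x) | (~mask & y)));
  return 0;
}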
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); @@ -40645,139 +43220,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return Ret; } -static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node"); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - EVT VT = N->getValueType(0); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - - if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) || - !TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) - return SDValue(); - - // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.shouldOptForSize(); - unsigned Bits = VT.getScalarSizeInBits(); - - // SHLD/SHRD instructions have lower register pressure, but on some - // platforms they have higher latency than the equivalent - // series of shifts/or that would otherwise be generated. - // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions - // have higher latencies and we are not optimizing for size. - if (!OptForSize && Subtarget.isSHLDSlow()) - return SDValue(); - - if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) - std::swap(N0, N1); - if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) - return SDValue(); - if (!N0.hasOneUse() || !N1.hasOneUse()) - return SDValue(); - - EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); - - SDValue ShAmt0 = N0.getOperand(1); - if (ShAmt0.getValueType() != ShiftVT) - return SDValue(); - SDValue ShAmt1 = N1.getOperand(1); - if (ShAmt1.getValueType() != ShiftVT) - return SDValue(); - - // Peek through any modulo shift masks. - SDValue ShMsk0; - if (ShAmt0.getOpcode() == ISD::AND && - isa<ConstantSDNode>(ShAmt0.getOperand(1)) && - ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) { - ShMsk0 = ShAmt0; - ShAmt0 = ShAmt0.getOperand(0); - } - SDValue ShMsk1; - if (ShAmt1.getOpcode() == ISD::AND && - isa<ConstantSDNode>(ShAmt1.getOperand(1)) && - ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) { - ShMsk1 = ShAmt1; - ShAmt1 = ShAmt1.getOperand(0); - } - - if (ShAmt0.getOpcode() == ISD::TRUNCATE) - ShAmt0 = ShAmt0.getOperand(0); - if (ShAmt1.getOpcode() == ISD::TRUNCATE) - ShAmt1 = ShAmt1.getOperand(0); - - SDLoc DL(N); - unsigned Opc = ISD::FSHL; - SDValue Op0 = N0.getOperand(0); - SDValue Op1 = N1.getOperand(0); - if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { - Opc = ISD::FSHR; - std::swap(Op0, Op1); - std::swap(ShAmt0, ShAmt1); - std::swap(ShMsk0, ShMsk1); - } - - auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1, - SDValue Amt) { - if (Opc == ISD::FSHR) - std::swap(Op0, Op1); - return DAG.getNode(Opc, DL, VT, Op0, Op1, - DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt)); - }; - - // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C ) - // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C ) - // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C ) - // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C ) - // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C ) - // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C ) - if (ShAmt1.getOpcode() == ISD::SUB) { - SDValue Sum = ShAmt1.getOperand(0); - if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) { - SDValue ShAmt1Op1 = ShAmt1.getOperand(1); - if (ShAmt1Op1.getOpcode() == ISD::AND && - isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) && - 
ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) { - ShMsk1 = ShAmt1Op1; - ShAmt1Op1 = ShAmt1Op1.getOperand(0); - } - if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) - ShAmt1Op1 = ShAmt1Op1.getOperand(0); - if ((SumC->getAPIntValue() == Bits || - (SumC->getAPIntValue() == 0 && ShMsk1)) && - ShAmt1Op1 == ShAmt0) - return GetFunnelShift(Op0, Op1, ShAmt0); - } - } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { - auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); - if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) - return GetFunnelShift(Op0, Op1, ShAmt0); - } else if (ShAmt1.getOpcode() == ISD::XOR) { - SDValue Mask = ShAmt1.getOperand(1); - if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) { - unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL); - SDValue ShAmt1Op0 = ShAmt1.getOperand(0); - if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) - ShAmt1Op0 = ShAmt1Op0.getOperand(0); - if (MaskC->getSExtValue() == (Bits - 1) && - (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { - if (Op1.getOpcode() == InnerShift && - isa<ConstantSDNode>(Op1.getOperand(1)) && - Op1.getConstantOperandAPInt(1).isOneValue()) { - return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); - } - // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). - if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && - Op1.getOperand(0) == Op1.getOperand(1)) { - return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0); - } - } - } - } - - return SDValue(); -} - static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -40797,7 +43239,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, // TODO: Support multiple SrcOps. if (VT == MVT::i1) { SmallVector<SDValue, 2> SrcOps; - if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) && + SmallVector<APInt, 2> SrcPartials; + if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) && SrcOps.size() == 1) { SDLoc dl(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -40807,13 +43250,19 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) Mask = DAG.getBitcast(MaskVT, SrcOps[0]); if (Mask) { - APInt AllBits = APInt::getNullValue(NumElts); - return DAG.getSetCC(dl, MVT::i1, Mask, - DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE); + assert(SrcPartials[0].getBitWidth() == NumElts && + "Unexpected partial reduction mask"); + SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT); + SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT); + Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits); + return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE); } } } + if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) + return R; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -40829,8 +43278,33 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; - if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget)) - return R; + // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y). + // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X). + // iff the upper elements of the non-shifted arg are zero. + // KUNPCK require 16+ bool vector elements. 
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfElts = NumElts / 2; + APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts); + if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && + N1.getConstantOperandAPInt(1) == HalfElts && + DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) { + SDLoc dl(N); + return DAG.getNode( + ISD::CONCAT_VECTORS, dl, VT, + extractSubVector(N0, 0, DAG, dl, HalfElts), + extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts)); + } + if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL && + N0.getConstantOperandAPInt(1) == HalfElts && + DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) { + SDLoc dl(N); + return DAG.getNode( + ISD::CONCAT_VECTORS, dl, VT, + extractSubVector(N1, 0, DAG, dl, HalfElts), + extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts)); + } + } // Attempt to recursively combine an OR of shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { @@ -41179,18 +43653,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, // A lambda checking the given SDValue is a constant vector and each element // is in the range [Min, Max]. auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { - BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); - if (!BV || !BV->isConstant()) - return false; - for (SDValue Op : V->ops()) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); - if (!C) - return false; - const APInt &Val = C->getAPIntValue(); - if (Val.ult(Min) || Val.ugt(Max)) - return false; - } - return true; + return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) { + return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max)); + }); }; // Check if each element of the vector is right-shifted by one. @@ -41291,10 +43756,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
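detectAVGPattern (tightened above with ISD::matchUnaryPredicate) looks for the rounding-average idiom that PAVGB/PAVGW implement: widen, add, add one, shift right by one. A standalone scalar sketch of that idiom for unsigned bytes:

#include <cassert>
#include <cstdint>

// Rounding average of two unsigned bytes, computed in a wider type so the
// intermediate sum cannot wrap -- the shape detectAVGPattern matches.
static uint8_t avg_round_u8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((uint16_t(a) + uint16_t(b) + 1) >> 1);
}

int main() {
  assert(avg_round_u8(0, 1) == 1);       // rounds up, like PAVGB
  assert(avg_round_u8(255, 255) == 255); // no wrap thanks to the widening
  assert(avg_round_u8(10, 13) == 12);
  return 0;
}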
ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; - unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && - ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || + ((Ld->isNonTemporal() && !Subtarget.hasInt256() && + Ld->getAlignment() >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, *Ld->getMemOperand(), &Fast) && !Fast))) { @@ -41302,17 +43767,18 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, if (NumElems < 2) return SDValue(); - unsigned HalfAlign = 16; + unsigned HalfOffset = 16; SDValue Ptr1 = Ld->getBasePtr(); - SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl); + SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), - Alignment, Ld->getMemOperand()->getFlags()); + Ld->getOriginalAlign(), + Ld->getMemOperand()->getFlags()); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, - Ld->getPointerInfo().getWithOffset(HalfAlign), - MinAlign(Alignment, HalfAlign), + Ld->getPointerInfo().getWithOffset(HalfOffset), + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); @@ -41329,13 +43795,28 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); if (TLI.isTypeLegal(IntVT)) { SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Alignment, + Ld->getPointerInfo(), + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); } } + // Cast ptr32 and ptr64 pointers to the default address space before a load. + unsigned AddrSpace = Ld->getAddressSpace(); + if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || + AddrSpace == X86AS::PTR32_UPTR) { + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + if (PtrVT != Ld->getBasePtr().getSimpleValueType()) { + SDValue Cast = + DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0); + return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(), + Ld->getOriginalAlign(), + Ld->getMemOperand()->getFlags()); + } + } + return SDValue(); } @@ -41482,7 +43963,7 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N); + auto *Mld = cast<MaskedLoadSDNode>(N); // TODO: Expanding load with constant mask may be optimized as well. if (Mld->isExpandingLoad()) @@ -41491,12 +43972,33 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) return ScalarLoad; + // TODO: Do some AVX512 subsets benefit from this transform? if (!Subtarget.hasAVX512()) if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) return Blend; } + // If the mask value has been legalized to a non-boolean vector, try to + // simplify ops leading up to it. We only demand the MSB of each lane. 
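The combineLoad hunk above splits certain 32-byte loads into two 16-byte halves, reusing the original alignment for both and offsetting the second pointer by 16. A rough standalone sketch of that address math with plain memcpy stand-ins (not SelectionDAG calls):

#include <cassert>
#include <cstdint>
#include <cstring>

// Read 32 bytes as two independent 16-byte halves, the second at offset 16,
// mirroring Ptr2 = Ptr1 + HalfOffset in the combine above.
static void load_split_32(const uint8_t *src, uint8_t lo[16], uint8_t hi[16]) {
  std::memcpy(lo, src, 16);
  std::memcpy(hi, src + 16, 16);
}

int main() {
  uint8_t buf[32];
  for (int i = 0; i < 32; ++i)
    buf[i] = uint8_t(i);

  uint8_t lo[16], hi[16];
  load_split_32(buf, lo, hi);
  assert(lo[0] == 0 && lo[15] == 15 && hi[0] == 16 && hi[15] == 31);
  return 0;
}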
+ SDValue Mask = Mld->getMask(); + if (Mask.getScalarValueSizeInBits() != 1) { + EVT VT = Mld->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + if (SDValue NewMask = + TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG)) + return DAG.getMaskedLoad( + VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(), + NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(), + Mld->getAddressingMode(), Mld->getExtensionType()); + } + return SDValue(); } @@ -41548,9 +44050,18 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { - APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); return SDValue(N, 0); + } + if (SDValue NewMask = + TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG)) + return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(), + Mst->getBasePtr(), Mst->getOffset(), NewMask, + Mst->getMemoryVT(), Mst->getMemOperand(), + Mst->getAddressingMode()); } SDValue Value = Mst->getValue(); @@ -41572,7 +44083,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoreSDNode *St = cast<StoreSDNode>(N); EVT StVT = St->getMemoryVT(); SDLoc dl(St); - unsigned Alignment = St->getAlignment(); SDValue StoredVal = St->getValue(); EVT VT = StoredVal.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -41585,7 +44095,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal = DAG.getBitcast(NewVT, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -41596,7 +44106,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, StoredVal.getOperand(0).getValueType() == MVT::i8) { return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0), St->getBasePtr(), St->getPointerInfo(), - St->getAlignment(), St->getMemOperand()->getFlags()); + St->getOriginalAlign(), + St->getMemOperand()->getFlags()); } // Widen v2i1/v4i1 stores to v8i1. @@ -41607,7 +44118,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -41616,7 +44127,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) && ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) { // If its a v64i1 store without 64-bit support, we need two stores. 
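Both the masked-load and masked-store combines above run SimplifyDemandedBits on the mask with only the sign bit of each lane demanded, because x86 blends and masked memory ops key off the lane's MSB alone. A scalar illustration of why any mask value with the same sign bits is interchangeable:

#include <cassert>
#include <cstdint>

// A per-lane select that, like BLENDV and vector masked ops, looks only at
// the sign bit of the mask lane.
static int32_t select_by_msb(int32_t mask, int32_t a, int32_t b) {
  return (mask < 0) ? a : b;
}

int main() {
  // Two masks that differ in every bit except the sign bit select identically.
  int32_t m1 = int32_t(0x80000001u);
  int32_t m2 = int32_t(0xFFFFFFFFu);
  assert(select_by_msb(m1, 7, 9) == select_by_msb(m2, 7, 9));

  int32_t m3 = 0x7FFFFFFF, m4 = 0x00000000;
  assert(select_by_msb(m3, 7, 9) == select_by_msb(m4, 7, 9));
  return 0;
}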
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { + if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32)); Lo = combinevXi1ConstantToInteger(Lo, DAG); @@ -41629,18 +44140,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue Ch0 = DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), - Alignment, St->getMemOperand()->getFlags()); + St->getOriginalAlign(), + St->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4), - MinAlign(Alignment, 4U), + St->getOriginalAlign(), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } @@ -41659,7 +44171,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } // Split under-aligned vector non-temporal stores. - if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) { + if (St->isNonTemporal() && StVT == VT && + St->getAlignment() < VT.getStoreSize()) { // ZMM/YMM nt-stores - either it can be stored as a series of shorter // vectors or the legalizer can scalarize it to use MOVNTI. if (VT.is256BitVector() || VT.is512BitVector()) { @@ -41713,7 +44226,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl)) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); if (TLI.isTruncStoreLegal(VT, StVT)) { @@ -41731,6 +44244,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } + // Cast ptr32 and ptr64 pointers to the default address space before a store. + unsigned AddrSpace = St->getAddressSpace(); + if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || + AddrSpace == X86AS::PTR32_UPTR) { + MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); + if (PtrVT != St->getBasePtr().getSimpleValueType()) { + SDValue Cast = + DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0); + return DAG.getStore(St->getChain(), dl, StoredVal, Cast, + St->getPointerInfo(), St->getOriginalAlign(), + St->getMemOperand()->getFlags(), St->getAAInfo()); + } + } + // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. 
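The v64i1 store path above (via combinevXi1ConstantToInteger) stores a constant mask vector as plain integers, splitting into two 32-bit halves when 64-bit operations are unavailable. A small sketch of packing boolean lanes into such a bitmask (illustrative helper, not the LLVM routine):

#include <cassert>
#include <cstdint>

// Pack a v32i1-style constant into a 32-bit integer, one bit per lane.
static uint32_t pack_mask32(const bool lanes[32]) {
  uint32_t m = 0;
  for (int i = 0; i < 32; ++i)
    if (lanes[i])
      m |= uint32_t(1) << i;
  return m;
}

int main() {
  bool lanes[32] = {};
  lanes[0] = lanes[3] = lanes[31] = true;
  assert(pack_mask32(lanes) == 0x80000009u);
  return 0;
}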
// A preferable solution to the general problem is to figure out the right @@ -41785,13 +44312,38 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), + St->getPointerInfo(), St->getOriginalAlign(), St->getMemOperand()->getFlags()); } return SDValue(); } +static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + auto *St = cast<MemIntrinsicSDNode>(N); + + SDValue StoredVal = N->getOperand(1); + MVT VT = StoredVal.getSimpleValueType(); + EVT MemVT = St->getMemoryVT(); + + // Figure out which elements we demand. + unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits(); + APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts); + + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef, + KnownZero, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + + return SDValue(); +} + /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements @@ -42028,17 +44580,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // of one truncation. // i.e. if one of the inputs will constant fold or the input is repeated. switch (SrcOpcode) { - case ISD::AND: - case ISD::XOR: - case ISD::OR: { - SDValue Op0 = Src.getOperand(0); - SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) && - (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) - return TruncateArithmetic(Op0, Op1); - break; - } - case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. @@ -42047,21 +44588,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, !TLI.isOperationLegal(SrcOpcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; - case ISD::ADD: { - SDValue Op0 = Src.getOperand(0); - SDValue Op1 = Src.getOperand(1); - if (TLI.isOperationLegal(SrcOpcode, VT) && - (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) - return TruncateArithmetic(Op0, Op1); - break; - } + case ISD::AND: + case ISD::XOR: + case ISD::OR: + case ISD::ADD: case ISD::SUB: { - // TODO: ISD::SUB We are conservative and require both sides to be freely - // truncatable to avoid interfering with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(SrcOpcode, VT) && - (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1)))) + (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; } @@ -42172,13 +44707,17 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, MVT InSVT = InVT.getScalarType(); // Check we have a truncation suited for PACKSS/PACKUS. 
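combineTruncatedArithmetic (unified into one switch above) pushes a truncate through ADD/SUB/MUL/AND/OR/XOR when an input already truncates freely; this is sound because wrapping arithmetic agrees with performing the operation at the narrow width. A scalar check of that congruence:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t a = 0x0001FFFFu, b = 0xFFFF0003u;

  // Truncate-then-operate equals operate-then-truncate for wrapping ops.
  assert(uint16_t(a + b) == uint16_t(uint16_t(a) + uint16_t(b)));
  assert(uint16_t(a - b) == uint16_t(uint16_t(a) - uint16_t(b)));
  // Widen the narrow product to avoid any signed-overflow pitfalls.
  assert(uint16_t(a * b) == uint16_t(uint32_t(uint16_t(a)) * uint16_t(b)));
  assert(uint16_t(a ^ b) == uint16_t(uint16_t(a) ^ uint16_t(b)));
  return 0;
}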
- if (!VT.is128BitVector() && !VT.is256BitVector()) + if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) return SDValue(); if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); + // Truncation to sub-128bit vXi32 can be better handled with shuffles. + if (SVT == MVT::i32 && VT.getSizeInBits() < 128) + return SDValue(); + // AVX512 has fast truncate, but if the input is already going to be split, // there's no harm in trying pack. if (Subtarget.hasAVX512() && @@ -42199,6 +44738,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, // Use PACKSS if the input has sign-bits that extend all the way to the // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); + + // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with + // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later + // on and combines/simplifications can't then use it. + if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits()) + return SDValue(); + if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); @@ -42227,9 +44773,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) return SDValue(); - // Input type should be vXi32. + // Input type should be at least vXi32. EVT InVT = Src.getValueType(); - if (InVT.getVectorElementType() != MVT::i32) + if (InVT.getVectorElementType().getSizeInBits() < 32) return SDValue(); // Need a shift by 16. @@ -42438,7 +44984,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return combineVectorTruncation(N, DAG, Subtarget); } -static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { +static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); SDLoc DL(N); @@ -42448,6 +44995,11 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) { if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) + return SDValue(N, 0); + return SDValue(); } @@ -42540,37 +45092,46 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, if (NegMul) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FNMADD; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMADD: Opcode = ISD::FMA; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; + case ISD::FMA: Opcode = X86ISD::FNMADD; break; + case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; + case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMADD: Opcode = 
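combineVectorSignBitsTruncation only uses PACKSS as a truncation when ComputeNumSignBits proves the values already fit, since PACKSSWB otherwise saturates rather than truncates. A scalar model of the signed-saturating pack and the case where it degenerates to a plain truncate:

#include <cassert>
#include <cstdint>

// PACKSSWB-style lane: narrow i16 -> i8 with signed saturation.
static int8_t pack_ss(int16_t v) {
  if (v > 127) return 127;
  if (v < -128) return -128;
  return static_cast<int8_t>(v);
}

int main() {
  assert(pack_ss(1000) == 127);    // saturates, so not a plain truncate
  assert(pack_ss(-1000) == -128);

  // With at least 9 sign bits (value in [-128, 127]) saturation never fires,
  // so the pack behaves exactly like keeping the low byte.
  for (int v = -128; v <= 127; ++v)
    assert(pack_ss(int16_t(v)) == int8_t(v));
  return 0;
}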
ISD::FMA; break; + case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; + case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; } } if (NegAcc) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); - case ISD::FMA: Opcode = X86ISD::FMSUB; break; - case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; - case X86ISD::FMSUB: Opcode = ISD::FMA; break; - case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; - case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; - case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; - case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; - case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; - case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; - case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; - case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; - case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; + case ISD::FMA: Opcode = X86ISD::FMSUB; break; + case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break; + case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; + case X86ISD::FMSUB: Opcode = ISD::FMA; break; + case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break; + case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; + case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; + case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break; + case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; + case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; + case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break; + case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; + case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; } } if (NegRes) { switch (Opcode) { + // For accuracy reason, we never combine fneg and fma under strict FP. default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; @@ -42588,18 +45149,20 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, /// Do target-specific dag combines on floating point negations. static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT OrigVT = N->getValueType(0); SDValue Arg = isFNEG(DAG, N); if (!Arg) return SDValue(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + if (!TLI.isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the @@ -42613,80 +45176,25 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(OrigVT, NewNode); } - // If we're negating an FMA node, then we can adjust the - // instruction to include the extra negation. 
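The negateFMAOpcode table extended above (now covering the STRICT_* variants) relies on sign symmetries of fused multiply-add: negating the result, a multiplicand, or the addend just moves between FMADD/FMSUB/FNMADD/FNMSUB. A scalar check of two of those identities with std::fma, assuming round-to-nearest (which makes negation exact):

#include <cassert>
#include <cmath>

int main() {
  double a = 1.5, b = -2.25, c = 0.875;

  // -(a*b + c) == (-a)*b + (-c)   (the FMA -> FNMSUB row under NegRes)
  assert(-std::fma(a, b, c) == std::fma(-a, b, -c));

  // Negating either multiplicand is the same operation (hence NegA != NegB).
  assert(std::fma(-a, b, c) == std::fma(a, -b, c));
  return 0;
}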
- if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { - switch (Arg.getOpcode()) { - case ISD::FMA: - case X86ISD::FMSUB: - case X86ISD::FNMADD: - case X86ISD::FNMSUB: - case X86ISD::FMADD_RND: - case X86ISD::FMSUB_RND: - case X86ISD::FNMADD_RND: - case X86ISD::FNMSUB_RND: { - // We can't handle scalar intrinsic node here because it would only - // invert one element and not the whole vector. But we could try to handle - // a negation of the lower element only. - unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true); - return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops())); - } - } - } + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); + if (SDValue NegArg = + TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize)) + return DAG.getBitcast(OrigVT, NegArg); return SDValue(); } -char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, - bool ForCodeSize, - unsigned Depth) const { - // fneg patterns are removable even if they have multiple uses. - if (isFNEG(DAG, Op.getNode(), Depth)) - return 2; - - // Don't recurse exponentially. - if (Depth > SelectionDAG::MaxRecursionDepth) - return 0; - - EVT VT = Op.getValueType(); - EVT SVT = VT.getScalarType(); - switch (Op.getOpcode()) { - case ISD::FMA: - case X86ISD::FMSUB: - case X86ISD::FNMADD: - case X86ISD::FNMSUB: - case X86ISD::FMADD_RND: - case X86ISD::FMSUB_RND: - case X86ISD::FNMADD_RND: - case X86ISD::FNMSUB_RND: { - if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || - !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) - break; - - // This is always negatible for free but we might be able to remove some - // extra operand negations as well. - for (int i = 0; i != 3; ++i) { - char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V == 2) - return V; - } - return 1; - } - } - - return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations, - ForCodeSize, Depth); -} - SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, + NegatibleCost &Cost, unsigned Depth) const { // fneg patterns are removable even if they have multiple uses. - if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) + if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) { + Cost = NegatibleCost::Cheaper; return DAG.getBitcast(Op.getValueType(), Arg); + } EVT VT = Op.getValueType(); EVT SVT = VT.getScalarType(); @@ -42701,35 +45209,41 @@ SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, case X86ISD::FNMADD_RND: case X86ISD::FNMSUB_RND: { if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || - !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + !(SVT == MVT::f32 || SVT == MVT::f64) || + !isOperationLegal(ISD::FMA, VT)) break; // This is always negatible for free but we might be able to remove some // extra operand negations as well. 
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue()); - for (int i = 0; i != 3; ++i) { - char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V == 2) - NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations, - ForCodeSize, Depth + 1); - } + for (int i = 0; i != 3; ++i) + NewOps[i] = getCheaperNegatedExpression( + Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1); bool NegA = !!NewOps[0]; bool NegB = !!NewOps[1]; bool NegC = !!NewOps[2]; unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); + Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper + : NegatibleCost::Neutral; + // Fill in the non-negated ops with the original values. for (int i = 0, e = Op.getNumOperands(); i != e; ++i) if (!NewOps[i]) NewOps[i] = Op.getOperand(i); return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); } + case X86ISD::FRCP: + if (SDValue NegOp0 = + getNegatedExpression(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Cost, Depth + 1)) + return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0); + break; } return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, - ForCodeSize, Depth); + ForCodeSize, Cost, Depth); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, @@ -42790,6 +45304,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; + if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) + return R; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -42802,33 +45319,21 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG, if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; - return combineFneg(N, DAG, Subtarget); + return combineFneg(N, DAG, DCI, Subtarget); } static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); unsigned NumBits = VT.getSizeInBits(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - // TODO - Constant Folding. - if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) { - // Reduce Cst1 to the bottom 16-bits. - // NOTE: SimplifyDemandedBits won't do this for constants. - const APInt &Val1 = Cst1->getAPIntValue(); - APInt MaskedVal1 = Val1 & 0xFFFF; - if (MaskedVal1 != Val1) - return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0, - DAG.getConstant(MaskedVal1, SDLoc(N), VT)); - } - - // Only bottom 16-bits of the control bits are required. - APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16)); - if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI)) + + // Simplify the inputs. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedMask(APInt::getAllOnesValue(NumBits)); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); @@ -42919,6 +45424,7 @@ static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. 
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); @@ -42930,7 +45436,7 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, if (isNullFPScalarOrVectorConst(N->getOperand(1))) return N->getOperand(0); - if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) + if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget)) return NewVal; return lowerX86FPLogicOp(N, DAG, Subtarget); @@ -43041,23 +45547,16 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); - // Unless the load is volatile or atomic. - if (LN->isSimple()) { + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getIntegerVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { SDLoc dl(N); - unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); - MVT MemVT = MVT::getIntegerVT(NumBits); - MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); - SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); DCI.CombineTo(N, Convert); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); return SDValue(N, 0); } } @@ -43067,33 +45566,33 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - // FIXME: Handle strict fp nodes. + bool IsStrict = N->isTargetStrictFPOpcode(); EVT VT = N->getValueType(0); // Convert a full vector load into vzload when not all bits are needed. - SDValue In = N->getOperand(0); + SDValue In = N->getOperand(IsStrict ? 1 : 0); MVT InVT = In.getSimpleValueType(); if (VT.getVectorNumElements() < InVT.getVectorNumElements() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); LoadSDNode *LN = cast<LoadSDNode>(In); - // Unless the load is volatile or atomic. 
- if (LN->isSimple()) { + unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); + MVT MemVT = MVT::getFloatingPointVT(NumBits); + MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { SDLoc dl(N); - unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); - MVT MemVT = MVT::getFloatingPointVT(NumBits); - MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); - SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); - SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; - SDValue VZLoad = - DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT, - LN->getPointerInfo(), - LN->getAlignment(), - LN->getMemOperand()->getFlags()); - SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, - DAG.getBitcast(InVT, VZLoad)); - DCI.CombineTo(N, Convert); + if (IsStrict) { + SDValue Convert = + DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, + {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)}); + DCI.CombineTo(N, Convert, Convert.getValue(1)); + } else { + SDValue Convert = + DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); + DCI.CombineTo(N, Convert); + } DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); return SDValue(N, 0); } } @@ -43132,14 +45631,58 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { - SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // BT ignores high bits in the bit index operand. unsigned BitWidth = N1.getValueSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); - if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask)) - return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1); + if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + + return SDValue(); +} + +static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS; + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + + if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) { + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getLowBitsSet(8, 4); + if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, + DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + + // Convert a full vector load into vzload when not all bits are needed. + if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 
1 : 0)); + if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) { + SDLoc dl(N); + if (IsStrict) { + SDValue Convert = DAG.getNode( + N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, + {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)}); + DCI.CombineTo(N, Convert, Convert.getValue(1)); + } else { + SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, + DAG.getBitcast(MVT::v8i16, VZLoad)); + DCI.CombineTo(N, Convert); + } + + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + return SDValue(N, 0); + } + } + } return SDValue(); } @@ -43225,7 +45768,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || - N0.getOpcode() == ISD::SIGN_EXTEND)) { + N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, @@ -43234,9 +45777,14 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); + // Attempt to promote any comparison mask ops before moving the + // SIGN_EXTEND_INREG in the way. + if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget)) + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1); + if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { - SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, - N00, N1); + SDValue Tmp = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } @@ -43421,6 +45969,21 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, for (unsigned i = 0; i != Scale; ++i) ShuffleMask.append(EltSizeInBits, i); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && + (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { + // If we have register broadcast instructions, use the scalar size as the + // element type for the shuffle. Then cast to the wider element type. The + // widened bits won't be used, and this might allow the use of a broadcast + // load. + assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); + unsigned Scale = EltSizeInBits / NumElts; + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + ShuffleMask.append(NumElts * Scale, 0); + Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); + Vec = DAG.getBitcast(VT, Vec); } else { // For smaller scalar integers, we can simply any-extend it to the vector // element size (we don't care about the upper bits) and broadcast it to all @@ -43428,8 +45991,8 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); ShuffleMask.append(NumElts, 0); + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); } - Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); // Now, mask the relevant bit in each element. SmallVector<SDValue, 32> Bits; @@ -43474,7 +46037,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, // We can only do this if the vector size in 256 bits or less. 
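combineToExtendBoolVectorInReg (given an extra broadcast path above) turns a scalar bit-mask into a vector of 0/-1 lanes by broadcasting the scalar, AND-ing each lane with its own bit, and comparing. A scalar-loop sketch of that expansion, separate from the patch itself:

#include <cassert>
#include <cstdint>

// Expand an 8-bit mask into eight 0 / -1 lanes the way the combine does:
// broadcast the scalar, isolate bit i in lane i, then compare against the bit.
static void expand_mask8(uint8_t mask, int32_t lanes[8]) {
  for (int i = 0; i < 8; ++i) {
    uint32_t broadcast = mask;              // same scalar in every lane
    uint32_t bit = uint32_t(1) << i;        // per-lane bit select
    lanes[i] = (broadcast & bit) == bit ? -1 : 0;
  }
}

int main() {
  int32_t lanes[8];
  expand_mask8(0b10100101, lanes);
  assert(lanes[0] == -1 && lanes[1] == 0 && lanes[2] == -1 && lanes[5] == -1);
  assert(lanes[6] == 0 && lanes[7] == -1);
  return 0;
}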
unsigned Size = VT.getSizeInBits(); - if (Size > 256) + if (Size > 256 && Subtarget.useAVX512Regs()) return SDValue(); // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since @@ -43492,7 +46055,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); if (N->getOpcode() == ISD::ZERO_EXTEND) - Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType()); + Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType()); return Res; } @@ -43505,6 +46068,23 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, EVT InVT = N0.getValueType(); SDLoc DL(N); + // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) + if (!DCI.isBeforeLegalizeOps() && + N0.getOpcode() == X86ISD::SETCC_CARRY) { + SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0), + N0->getOperand(1)); + bool ReplaceOtherUses = !N0.hasOneUse(); + DCI.CombineTo(N, Setcc); + // Replace other uses with a truncate of the widened setcc_carry. + if (ReplaceOtherUses) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), + N0.getValueType(), Setcc); + DCI.CombineTo(N0.getNode(), Trunc); + } + + return SDValue(N, 0); + } + if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; @@ -43542,6 +46122,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); + bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode(); // Let legalize expand this if it isn't a legal type yet. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -43552,15 +46133,16 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); - SDValue A = N->getOperand(0); - SDValue B = N->getOperand(1); - SDValue C = N->getOperand(2); + SDValue A = N->getOperand(IsStrict ? 1 : 0); + SDValue B = N->getOperand(IsStrict ? 2 : 1); + SDValue C = N->getOperand(IsStrict ? 3 : 2); auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); bool LegalOperations = !DCI.isBeforeLegalizeOps(); - if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) { - V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize); + if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations, + CodeSize)) { + V = NegV; return true; } // Look through extract_vector_elts. 
If it comes from an FNEG, create a @@ -43568,11 +46150,10 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { SDValue Vec = V.getOperand(0); - if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) { - SDValue NegVal = - TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize); + if (SDValue NegV = TLI.getCheaperNegatedExpression( + Vec, DAG, LegalOperations, CodeSize)) { V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), - NegVal, V.getOperand(1)); + NegV, V.getOperand(1)); return true; } } @@ -43592,9 +46173,15 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); - if (N->getNumOperands() == 4) - return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); - return DAG.getNode(NewOpcode, dl, VT, A, B, C); + if (IsStrict) { + assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4"); + return DAG.getNode(NewOpcode, dl, {VT, MVT::Other}, + {N->getOperand(0), A, B, C}); + } else { + if (N->getNumOperands() == 4) + return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); + return DAG.getNode(NewOpcode, dl, VT, A, B, C); + } } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) @@ -43608,10 +46195,11 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, bool LegalOperations = !DCI.isBeforeLegalizeOps(); SDValue N2 = N->getOperand(2); - if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2) - return SDValue(); - SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize); + SDValue NegN2 = + TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize); + if (!NegN2) + return SDValue(); unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); if (N->getNumOperands() == 4) @@ -43624,38 +46212,26 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> - // (and (i32 x86isd::setcc_carry), 1) - // This eliminates the zext. This transformation is necessary because - // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); - if (N0.getOpcode() == ISD::AND && - N0.hasOneUse() && - N0.getOperand(0).hasOneUse()) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - if (!isOneConstant(N0.getOperand(1))) - return SDValue(); - return DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, - N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, dl, VT)); + // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) + // FIXME: Is this needed? We don't seem to have any tests for it. + if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND && + N0.getOpcode() == X86ISD::SETCC_CARRY) { + SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0), + N0->getOperand(1)); + bool ReplaceOtherUses = !N0.hasOneUse(); + DCI.CombineTo(N, Setcc); + // Replace other uses with a truncate of the widened setcc_carry. 
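// --- Illustrative sketch, not part of this change: why the new setcc_carry
// handling above can widen the node directly. SETCC_CARRY materializes the
// carry as all-ones or all-zeros (like SBB reg, reg), so sign/any-extending
// it reproduces the same pattern and the old and(setcc_carry, 1) form still
// recovers the 0/1 value.
#include <cassert>
#include <cstdint>

int main() {
  for (bool carry : {false, true}) {
    int8_t narrow = carry ? int8_t(-1) : int8_t(0); // i8 setcc_carry
    int32_t wide = narrow;                          // widened setcc_carry
    assert(wide == (carry ? -1 : 0));
    assert((wide & 1) == (carry ? 1 : 0));
  }
  return 0;
}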
+ if (ReplaceOtherUses) { + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), + N0.getValueType(), Setcc); + DCI.CombineTo(N0.getNode(), Trunc); } - } - if (N0.getOpcode() == ISD::TRUNCATE && - N0.hasOneUse() && - N0.getOperand(0).hasOneUse()) { - SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - return DAG.getNode(ISD::AND, dl, VT, - DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, - N00.getOperand(0), N00.getOperand(1)), - DAG.getConstant(1, dl, VT)); - } + return SDValue(N, 0); } if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) @@ -43768,13 +46344,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); - bool HasAVX = Subtarget.hasAVX(); // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. // Otherwise use PCMPEQ (plus AND) and mask testing. if ((OpSize == 128 && Subtarget.hasSSE2()) || - (OpSize == 256 && HasAVX) || + (OpSize == 256 && Subtarget.hasAVX()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { bool HasPT = Subtarget.hasSSE41(); @@ -43828,11 +46403,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, X = DAG.getBitcast(TmpCastVT, X); if (!NeedZExt && !TmpZext) return X; - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, DAG.getConstant(0, DL, VecVT), X, - DAG.getConstant(0, DL, VecIdxVT)); + DAG.getVectorIdxConstant(0, DL)); }; SDValue Cmp; @@ -43865,17 +46438,16 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, Cmp); SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; - SDValue SetCC = getSETCC(X86CC, PT, DL, DAG); - return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0)); + SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0)); } // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne - // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq - // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne + assert(Cmp.getValueType() == MVT::v16i8 && + "Non 128-bit vector on pre-SSE41 target"); SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); - SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 
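// --- Illustrative sketch, not part of this change: the pre-SSE4.1 lowering of
// a vector-sized equality setcc shown above. PCMPEQB yields 0xFF per matching
// byte, PMOVMSKB collects those into a 16-bit mask, and the operands are equal
// iff the mask is 0xFFFF. Assumes x86-64 with SSE2.
#include <cassert>
#include <cstring>
#include <emmintrin.h>

static bool eq16(const void *a, const void *b) {
  __m128i va = _mm_loadu_si128(static_cast<const __m128i *>(a));
  __m128i vb = _mm_loadu_si128(static_cast<const __m128i *>(b));
  return _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) == 0xFFFF;
}

int main() {
  char x[16] = "0123456789abcde";
  char y[16] = "0123456789abcde";
  assert(eq16(x, y) == (std::memcmp(x, y, 16) == 0));
  y[7] ^= 1;
  assert(eq16(x, y) == (std::memcmp(x, y, 16) == 0));
  return 0;
}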
0xFFFF : 0xFFFFFFFF, DL, - MVT::i32); + SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32); return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); } @@ -43892,23 +46464,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if (CC == ISD::SETNE || CC == ISD::SETEQ) { - // 0-x == y --> x+y == 0 - // 0-x != y --> x+y != 0 - if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && - LHS.hasOneUse()) { - SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); - return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); - } - // x == 0-y --> x+y == 0 - // x != 0-y --> x+y != 0 - if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && - RHS.hasOneUse()) { - SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); - return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); - } - if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) return V; + + if (VT == MVT::i1 && isNullConstant(RHS)) { + SDValue X86CC; + if (SDValue V = + MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC)) + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V)); + } } if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && @@ -43931,7 +46496,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (IsSEXT0 && IsVZero1) { assert(VT == Op0.getOperand(0).getValueType() && - "Uexpected operand type"); + "Unexpected operand type"); if (TmpCC == ISD::SETGT) return DAG.getConstant(0, DL, VT); if (TmpCC == ISD::SETLE) @@ -44021,20 +46586,43 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); return SDValue(N, 0); + } } return SDValue(); } +static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, + SDValue Index, SDValue Base, SDValue Scale, + SelectionDAG &DAG) { + SDLoc DL(GorS); + + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { + SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), + Gather->getMask(), Base, Index, Scale } ; + return DAG.getMaskedGather(Gather->getVTList(), + Gather->getMemoryVT(), DL, Ops, + Gather->getMemOperand(), + Gather->getIndexType()); + } + auto *Scatter = cast<MaskedScatterSDNode>(GorS); + SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), + Scatter->getMask(), Base, Index, Scale }; + return DAG.getMaskedScatter(Scatter->getVTList(), + Scatter->getMemoryVT(), DL, + Ops, Scatter->getMemOperand(), + Scatter->getIndexType()); +} + static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto *GorS = cast<MaskedGatherScatterSDNode>(N); - SDValue Chain = GorS->getChain(); SDValue Index = GorS->getIndex(); - SDValue Mask = GorS->getMask(); SDValue Base = GorS->getBasePtr(); SDValue Scale = GorS->getScale(); @@ -44054,21 +46642,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, unsigned NumElts = Index.getValueType().getVectorNumElements(); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Chain, 
Gather->getPassThru(), - Mask, Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), - Gather->getIndexType()); - } - auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Chain, Scatter->getValue(), - Mask, Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, - Ops, Scatter->getMemOperand(), - Scatter->getIndexType()); + return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); } } @@ -44083,21 +46657,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, unsigned NumElts = Index.getValueType().getVectorNumElements(); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Chain, Gather->getPassThru(), - Mask, Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), - Gather->getIndexType()); - } - auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Chain, Scatter->getValue(), - Mask, Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, - Ops, Scatter->getMemOperand(), - Scatter->getIndexType()); + return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); } } @@ -44110,30 +46670,20 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Chain, Gather->getPassThru(), - Mask, Base, Index, Scale } ; - return DAG.getMaskedGather(Gather->getVTList(), - Gather->getMemoryVT(), DL, Ops, - Gather->getMemOperand(), - Gather->getIndexType()); - } - auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Chain, Scatter->getValue(), - Mask, Base, Index, Scale }; - return DAG.getMaskedScatter(Scatter->getVTList(), - Scatter->getMemoryVT(), DL, - Ops, Scatter->getMemOperand(), - Scatter->getIndexType()); + return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); } } // With vector masks we only demand the upper bit of the mask. + SDValue Mask = GorS->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); - if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); return SDValue(N, 0); + } } return SDValue(); @@ -44172,10 +46722,11 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// TODO: Could we move this to DAGCombine? static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG) { - // Take advantage of vector comparisons producing 0 or -1 in each lane to - // optimize away operation when it's from a constant. + // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane + // to optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> @@ -44187,9 +46738,10 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, // aren't the same. 
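// --- Illustrative sketch, not part of this change: the per-lane reasoning
// behind UNARYOP(AND(VECTOR_CMP(x,y), C)) == AND(VECTOR_CMP(x,y), UNARYOP(C)).
// A compare lane is all-ones or all-zeros, so the AND keeps C or yields 0, and
// converting 0 produces the all-zero-bit float 0.0f, which the mask preserves.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bits(float f) { uint32_t u; std::memcpy(&u, &f, 4); return u; }

int main() {
  const int32_t C = 42;
  for (int32_t mask : {int32_t(0), int32_t(-1)}) {
    float convertAfterAnd = static_cast<float>(mask & C);            // UNARYOP(AND(cmp, C))
    uint32_t andAfterConvert = bits(static_cast<float>(C)) & uint32_t(mask); // AND(cmp, UNARYOP(C))
    assert(bits(convertAfterAnd) == andAfterConvert);
  }
  return 0;
}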
EVT VT = N->getValueType(0); bool IsStrict = N->isStrictFPOpcode(); + unsigned NumEltBits = VT.getScalarSizeInBits(); SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); - if (!VT.isVector() || Op0->getOpcode() != ISD::AND || - Op0->getOperand(0)->getOpcode() != ISD::SETCC || + if (!VT.isVector() || Op0.getOpcode() != ISD::AND || + DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits || VT.getSizeInBits() != Op0.getValueSizeInBits()) return SDValue(); @@ -44362,7 +46914,6 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (!Subtarget.useSoftFloat() && Subtarget.hasX87() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); - EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 or f128. if (VT == MVT::f16 || VT == MVT::f128) @@ -44373,11 +46924,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); - if (Ld->isSimple() && !VT.isVector() && - ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && - !Subtarget.is64Bit() && LdVT == MVT::i64) { - std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD( - SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); + if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) && + Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) { + std::pair<SDValue, SDValue> Tmp = + Subtarget.getTargetLowering()->BuildFILD( + VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), + Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second); return Tmp.first; } @@ -44711,7 +47263,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { } if (CC == X86::COND_A) { - SDValue EFLAGS = Y->getOperand(1); + SDValue EFLAGS = Y.getOperand(1); // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // @@ -44724,13 +47276,44 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); - SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), NewEFLAGS); } } + if (CC == X86::COND_AE) { + // X + SETAE --> sbb X, -1 + // X - SETAE --> adc X, -1 + return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), Y.getOperand(1)); + } + + if (CC == X86::COND_BE) { + // X + SETBE --> sbb X, -1 + // X - SETBE --> adc X, -1 + SDValue EFLAGS = Y.getOperand(1); + // Try to convert COND_BE into COND_AE in an attempt to facilitate + // materializing "setae reg". + // + // Do not flip "e <= c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); + return DAG.getNode(IsSub ? 
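// --- Illustrative sketch, not part of this change: the identity behind
// "X + SETAE --> sbb X, -1" above. CMP a,b leaves CF = (a < b), and SBB
// computes X - src - CF, so X - (-1) - CF == X + 1 - (a < b) == X + (a >= b).
// All arithmetic is modulo 2^64, matching the flags model.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x123456789abcdef0ULL;
  uint64_t b = 7;
  for (uint64_t a : {5ULL, 9ULL}) {
    uint64_t setae = (a >= b) ? 1 : 0;
    uint64_t cf = (a < b) ? 1 : 0;        // borrow produced by a - b
    uint64_t sbb = X - uint64_t(-1) - cf; // sbb X, -1
    assert(X + setae == sbb);
  }
  return 0;
}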
X86ISD::ADC : X86ISD::SBB, DL, + DAG.getVTList(VT, MVT::i32), X, + DAG.getConstant(-1, DL, VT), NewEFLAGS); + } + } + if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); @@ -44767,15 +47350,18 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { SDValue One = DAG.getConstant(1, DL, ZVT); - SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, - DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1); + DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), + Cmp1.getValue(1)); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); - SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); + SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); + SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); @@ -44784,151 +47370,12 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, - DAG.getConstant(-1ULL, DL, VT), Cmp1); + DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, - DAG.getConstant(0, DL, VT), Cmp1); -} - -static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - EVT VT = N->getValueType(0); - - // If the vector size is less than 128, or greater than the supported RegSize, - // do not use PMADD. - if (!VT.isVector() || VT.getVectorNumElements() < 8) - return SDValue(); - - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - - auto UsePMADDWD = [&](SDValue Op) { - ShrinkMode Mode; - return Op.getOpcode() == ISD::MUL && - canReduceVMulWidth(Op.getNode(), DAG, Mode) && - Mode != ShrinkMode::MULU16 && - (!Subtarget.hasSSE41() || - (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - Op->isOnlyUserOf(Op.getOperand(1).getNode()))); - }; - - SDValue MulOp, OtherOp; - if (UsePMADDWD(Op0)) { - MulOp = Op0; - OtherOp = Op1; - } else if (UsePMADDWD(Op1)) { - MulOp = Op1; - OtherOp = Op0; - } else - return SDValue(); - - SDLoc DL(N); - EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - VT.getVectorNumElements()); - EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - VT.getVectorNumElements() / 2); - - // Shrink the operands of mul. 
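// --- Illustrative sketch, not part of this change: what VPMADDWD computes,
// which is why the (removed here) loop MAdd pattern could shrink an i32
// multiply-add reduction whose inputs fit in 16 bits down to PMADDWD. Each
// i32 lane is the sum of two adjacent i16 products. Assumes x86-64 with SSE2.
#include <cassert>
#include <cstdint>
#include <emmintrin.h>

int main() {
  alignas(16) int16_t a[8] = {1, -2, 3, 4, -5, 6, 7, 8};
  alignas(16) int16_t b[8] = {9, 10, -11, 12, 13, 14, 15, -16};
  __m128i va = _mm_load_si128(reinterpret_cast<const __m128i *>(a));
  __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i *>(b));
  alignas(16) int32_t r[4];
  _mm_store_si128(reinterpret_cast<__m128i *>(r), _mm_madd_epi16(va, vb));
  for (int i = 0; i != 4; ++i)
    assert(r[i] == int32_t(a[2 * i]) * b[2 * i] +
                       int32_t(a[2 * i + 1]) * b[2 * i + 1]);
  return 0;
}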
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); - SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); - - // Madd vector size is half of the original vector size - auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); - return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops); - }; - SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, - PMADDWDBuilder); - // Fill the rest of the output with 0 - SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType()); - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags); -} - -static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - if (!Subtarget.hasSSE2()) - return SDValue(); - - SDLoc DL(N); - EVT VT = N->getValueType(0); - - // TODO: There's nothing special about i32, any integer type above i16 should - // work just as well. - if (!VT.isVector() || !VT.isSimple() || - !(VT.getVectorElementType() == MVT::i32)) - return SDValue(); - - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. - if (VT.getSizeInBits() / 4 > RegSize) - return SDValue(); - - // We know N is a reduction add. To match SAD, we need one of the operands to - // be an ABS. - SDValue AbsOp = N->getOperand(0); - SDValue OtherOp = N->getOperand(1); - if (AbsOp.getOpcode() != ISD::ABS) - std::swap(AbsOp, OtherOp); - if (AbsOp.getOpcode() != ISD::ABS) - return SDValue(); - - // Check whether we have an abs-diff pattern feeding into the select. - SDValue SadOp0, SadOp1; - if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1)) - return SDValue(); - - // SAD pattern detected. Now build a SAD instruction and an addition for - // reduction. Note that the number of elements of the result of SAD is less - // than the number of elements of its input. Therefore, we could only update - // part of elements in the reduction vector. - SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget); - - // The output of PSADBW is a vector of i64. - // We need to turn the vector of i64 into a vector of i32. - // If the reduction vector is at least as wide as the psadbw result, just - // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of - // the PSADBW will be zero. If we promote/ narrow vectors, truncate the v2i64 - // result to v2i32 which will be removed by type legalization. If we/ widen - // narrow vectors then we bitcast to v4i32 and extract v2i32. - MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); - Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); - - if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Fill the upper elements with zero to match the add width. 
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs"); - unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits(); - SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT)); - Ops[0] = Sad; - Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops); - } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) { - Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad, - DAG.getIntPtrConstant(0, DL)); - } - - // Preserve the reduction flag on the ADD. We may need to revisit for the - // other operand. - SDNodeFlags Flags; - Flags.setVectorReduction(true); - return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags); + DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, @@ -45020,30 +47467,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, Mode == ShrinkMode::MULU16) return SDValue(); + EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + VT.getVectorNumElements() * 2); + SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0)); + SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1)); + auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops) { - // Shrink by adding truncate nodes and let DAGCombine fold with the - // sources. EVT InVT = Ops[0].getValueType(); - assert(InVT.getScalarType() == MVT::i32 && - "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); - EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, - InVT.getVectorNumElements()); - return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]), - DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1])); + return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; - return SplitOpsAndApply(DAG, Subtarget, DL, VT, - { Mul.getOperand(0), Mul.getOperand(1) }, - PMADDBuilder); + return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder); } // Attempt to turn this pattern into PMADDWD. -// (mul (add (sext (build_vector)), (sext (build_vector))), -// (add (sext (build_vector)), (sext (build_vector))) +// (add (mul (sext (build_vector)), (sext (build_vector))), +// (mul (sext (build_vector)), (sext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { @@ -45165,13 +47607,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - const SDNodeFlags Flags = N->getFlags(); - if (Flags.hasVectorReduction()) { - if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) - return Sad; - if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) - return MAdd; - } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -45262,6 +47697,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, SubusRHS = MinLHS; else return SDValue(); + } else if (Op1.getOpcode() == ISD::TRUNCATE && + Op1.getOperand(0).getOpcode() == ISD::UMIN && + (EltVT == MVT::i8 || EltVT == MVT::i16)) { + // Special case where the UMIN has been truncated. Try to push the truncate + // further up. This is similar to the i32/i64 special processing. 
+ SubusLHS = Op0; + SDValue MinLHS = Op1.getOperand(0).getOperand(0); + SDValue MinRHS = Op1.getOperand(0).getOperand(1); + EVT TruncVT = Op1.getOperand(0).getValueType(); + if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 || + TruncVT == MVT::v8i64)) && + !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32))) + return SDValue(); + SDValue OpToSaturate; + if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && + MinLHS.getOperand(0) == Op0) + OpToSaturate = MinRHS; + else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && + MinRHS.getOperand(0) == Op0) + OpToSaturate = MinLHS; + else + return SDValue(); + + // Saturate the non-extended input and then truncate it. + SDLoc DL(N); + SDValue SaturationConst = + DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(), + VT.getScalarSizeInBits()), + DL, TruncVT); + SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate, + SaturationConst); + SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin); } else return SDValue(); @@ -45376,6 +47843,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors"); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) return DAG.getUNDEF(VT); @@ -45386,6 +47854,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return getZeroVector(VT, Subtarget, DAG, DL); SDValue Op0 = Ops[0]; + bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); // Fold subvector loads into one. // If needed, look through bitcasts to get to the load. @@ -45402,13 +47871,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } // Repeated subvectors. - if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) { + if (IsSplat) { // If this broadcast/subv_broadcast is inserted into both halves, use a // larger broadcast/subv_broadcast. if (Op0.getOpcode() == X86ISD::VBROADCAST || Op0.getOpcode() == X86ISD::SUBV_BROADCAST) return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); + // If this broadcast_load is inserted into both halves, use a larger + // broadcast_load. Update other uses to use an extracted subvector. 
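// --- Illustrative sketch, not part of this change: why concatenating two
// identical broadcasts is just a wider broadcast, as exploited above for
// VBROADCAST/VBROADCAST_LOAD. Splatting x into both 128-bit halves produces
// the same 256-bit value as a single 256-bit splat, so one broadcast (load)
// can serve both halves. Assumes x86-64; compile with -mavx.
#include <cassert>
#include <cstring>
#include <immintrin.h>

int main() {
  float x = 3.25f;
  __m128 half = _mm_set1_ps(x);
  __m256 concat = _mm256_insertf128_ps(_mm256_castps128_ps256(half), half, 1);
  __m256 wide = _mm256_set1_ps(x);
  float a[8], b[8];
  _mm256_storeu_ps(a, concat);
  _mm256_storeu_ps(b, wide);
  assert(std::memcmp(a, b, sizeof(a)) == 0);
  return 0;
}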
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast<MemIntrinsicSDNode>(Op0); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()}; + SDValue BcastLd = DAG.getMemIntrinsicNode( + X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith( + Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits())); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && (Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0)))) @@ -45420,12 +47904,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && (Subtarget.hasAVX2() || - (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) && + (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) && Op0.getOperand(0).getValueType() == VT.getScalarType()) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0)); - } - bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; }); + // concat_vectors(extract_subvector(broadcast(x)), + // extract_subvector(broadcast(x))) -> broadcast(x) + if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR && + Op0.getOperand(0).getValueType() == VT) { + if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST || + Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD) + return Op0.getOperand(0); + } + } // Repeated opcode. // TODO - combineX86ShufflesRecursively should handle shuffle concatenation @@ -45435,6 +47926,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, })) { unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { + case X86ISD::SHUFP: { + // Add SHUFPD support if/when necessary. 
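// --- Illustrative sketch, not part of this change: the SHUFP concatenation
// added above. VSHUFPS on a 256-bit type shuffles each 128-bit lane
// independently, so two 128-bit SHUFPs that share the same immediate can be
// replaced by one 256-bit SHUFP of the concatenated operands. Assumes x86-64;
// compile with -mavx.
#include <cassert>
#include <cstring>
#include <immintrin.h>

int main() {
  alignas(32) float a[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  alignas(32) float c[8] = {10, 11, 12, 13, 14, 15, 16, 17};
  __m128 lo = _mm_shuffle_ps(_mm_load_ps(a), _mm_load_ps(c), 0x1B);
  __m128 hi = _mm_shuffle_ps(_mm_load_ps(a + 4), _mm_load_ps(c + 4), 0x1B);
  __m256 narrowPair = _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
  __m256 wide = _mm256_shuffle_ps(_mm256_load_ps(a), _mm256_load_ps(c), 0x1B);
  alignas(32) float r0[8], r1[8];
  _mm256_store_ps(r0, narrowPair);
  _mm256_store_ps(r1, wide);
  assert(std::memcmp(r0, r1, sizeof(r0)) == 0);
  return 0;
}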
+ if (!IsSplat && VT.getScalarType() == MVT::f32 && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op.getOperand(2) == Op0.getOperand(2); + })) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS), + Op0.getOperand(2)); + } + break; + } case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::PSHUFD: @@ -45461,8 +47970,42 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, return DAG.getBitcast(VT, Res); } break; + case X86ISD::VSHLI: + case X86ISD::VSRAI: + case X86ISD::VSRLI: + if (((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useAVX512Regs() && + (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(1) == Op.getOperand(1); + })) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(Ops[i].getOperand(0)); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), + Op0.getOperand(1)); + } + break; + case X86ISD::VPERMI: + case X86ISD::VROTLI: + case X86ISD::VROTRI: + if (VT.is512BitVector() && Subtarget.useAVX512Regs() && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(1) == Op.getOperand(1); + })) { + SmallVector<SDValue, 2> Src; + for (unsigned i = 0; i != NumOps; ++i) + Src.push_back(Ops[i].getOperand(0)); + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src), + Op0.getOperand(1)); + } + break; + case X86ISD::PACKSS: case X86ISD::PACKUS: - if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) { + if (!IsSplat && NumOps == 2 && VT.is256BitVector() && + Subtarget.hasInt256()) { SmallVector<SDValue, 2> LHS, RHS; for (unsigned i = 0; i != NumOps; ++i) { LHS.push_back(Ops[i].getOperand(0)); @@ -45476,6 +48019,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS)); } break; + case X86ISD::PALIGNR: + if (!IsSplat && + ((VT.is256BitVector() && Subtarget.hasInt256()) || + (VT.is512BitVector() && Subtarget.useBWIRegs())) && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op0.getOperand(2) == Op.getOperand(2); + })) { + SmallVector<SDValue, 2> LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS), + Op0.getOperand(2)); + } + break; } } @@ -45565,7 +48126,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, // if the insert or extract can be represented with a subregister operation. 
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubVec.getOperand(0).getSimpleValueType() == OpVT && - (IdxVal != 0 || !Vec.isUndef())) { + (IdxVal != 0 || + !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) { int ExtIdxVal = SubVec.getConstantOperandVal(1); if (ExtIdxVal != 0) { int VecNumElts = OpVT.getVectorNumElements(); @@ -45654,7 +48216,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { unsigned SelElts = SelVT.getVectorNumElements(); unsigned CastedElts = WideVT.getVectorNumElements(); - unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue(); + unsigned ExtIdx = Ext->getConstantOperandVal(1); if (SelElts % CastedElts == 0) { // The select has the same or more (narrower) elements than the extract // operand. The extraction index gets scaled by that factor. @@ -45699,6 +48261,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, MVT VT = N->getSimpleValueType(0); SDValue InVec = N->getOperand(0); + unsigned IdxVal = N->getConstantOperandVal(1); SDValue InVecBC = peekThroughBitcasts(InVec); EVT InVecVT = InVec.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -45716,7 +48279,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (isConcatenatedNot(InVecBC.getOperand(0)) || isConcatenatedNot(InVecBC.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 - SDValue Concat = split256IntArith(InVecBC, DAG); + SDValue Concat = splitVectorIntBinary(InVecBC, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); } @@ -45728,8 +48291,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, if (SDValue V = narrowExtractedVectorSelect(N, DAG)) return V; - unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); - if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); @@ -45779,6 +48340,43 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, } } + // If we're extracting an upper subvector from a broadcast we should just + // extract the lowest subvector instead which should allow + // SimplifyDemandedVectorElts do more simplifications. + if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST || + InVec.getOpcode() == X86ISD::VBROADCAST_LOAD)) + return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits()); + + // If we're extracting a broadcasted subvector, just use the source. + if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST && + InVec.getOperand(0).getValueType() == VT) + return InVec.getOperand(0); + + // Attempt to extract from the source of a shuffle vector. + if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 && + (IdxVal % VT.getVectorNumElements()) == 0) { + SmallVector<int, 32> ShuffleMask; + SmallVector<int, 32> ScaledMask; + SmallVector<SDValue, 2> ShuffleInputs; + unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits(); + // Decode the shuffle mask and scale it so its shuffling subvectors. 
+ if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && + scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { + unsigned SubVecIdx = IdxVal / VT.getVectorNumElements(); + if (ScaledMask[SubVecIdx] == SM_SentinelUndef) + return DAG.getUNDEF(VT); + if (ScaledMask[SubVecIdx] == SM_SentinelZero) + return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); + SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; + if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) { + unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; + unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements(); + return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, + SDLoc(N), VT.getSizeInBits()); + } + } + } + // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { @@ -45851,13 +48449,30 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { Src.getOperand(1)); // Reduce v2i64 to v4i32 if we don't need the upper bits. - // TODO: Move to DAGCombine? - if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND && - Src.getValueType() == MVT::i64 && Src.hasOneUse() && - Src.getOperand(0).getScalarValueSizeInBits() <= 32) - return DAG.getBitcast( - VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, - DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32))); + // TODO: Move to DAGCombine/SimplifyDemandedBits? + if (VT == MVT::v2i64 || VT == MVT::v2f64) { + auto IsAnyExt64 = [](SDValue Op) { + if (Op.getValueType() != MVT::i64 || !Op.hasOneUse()) + return SDValue(); + if (Op.getOpcode() == ISD::ANY_EXTEND && + Op.getOperand(0).getScalarValueSizeInBits() <= 32) + return Op.getOperand(0); + if (auto *Ld = dyn_cast<LoadSDNode>(Op)) + if (Ld->getExtensionType() == ISD::EXTLOAD && + Ld->getMemoryVT().getScalarSizeInBits() <= 32) + return Op; + return SDValue(); + }; + if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src))) + return DAG.getBitcast( + VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, + DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32))); + } + + // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ. + if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST && + Src.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0)); return SDValue(); } @@ -45928,13 +48543,16 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG, auto *Ld = cast<LoadSDNode>(In); if (Ld->isSimple()) { MVT SVT = In.getSimpleValueType().getVectorElementType(); - ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD; - EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT, - VT.getVectorNumElements()); + ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG + ? ISD::SEXTLOAD + : ISD::ZEXTLOAD; + EVT MemVT = + EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements()); if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { SDValue Load = DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), MemVT, Ld->getAlignment(), + Ld->getPointerInfo(), MemVT, + Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); return Load; @@ -45971,6 +48589,196 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS. 
+// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produce +// extra instructions between the conversion due to going to scalar and back. +static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) + return SDValue(); + + if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16) + return SDValue(); + + if (N->getValueType(0) != MVT::f32 || + N->getOperand(0).getOperand(0).getValueType() != MVT::f32) + return SDValue(); + + SDLoc dl(N); + SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, + N->getOperand(0).getOperand(0)); + Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, + DAG.getTargetConstant(4, dl, MVT::i32)); + Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, + DAG.getIntPtrConstant(0, dl)); +} + +static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) + return SDValue(); + + bool IsStrict = N->isStrictFPOpcode(); + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(IsStrict ? 1 : 0); + EVT SrcVT = Src.getValueType(); + + if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16) + return SDValue(); + + if (VT.getVectorElementType() != MVT::f32 && + VT.getVectorElementType() != MVT::f64) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) + return SDValue(); + + SDLoc dl(N); + + // Convert the input to vXi16. + EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); + Src = DAG.getBitcast(IntVT, Src); + + // Widen to at least 8 input elements. + if (NumElts < 8) { + unsigned NumConcats = 8 / NumElts; + SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT) + : DAG.getConstant(0, dl, IntVT); + SmallVector<SDValue, 4> Ops(NumConcats, Fill); + Ops[0] = Src; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops); + } + + // Destination is vXf32 with at least 4 elements. + EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, + std::max(4U, NumElts)); + SDValue Cvt, Chain; + if (IsStrict) { + Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other}, + {N->getOperand(0), Src}); + Chain = Cvt.getValue(1); + } else { + Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src); + } + + if (NumElts < 4) { + assert(NumElts == 2 && "Unexpected size"); + Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt, + DAG.getIntPtrConstant(0, dl)); + } + + if (IsStrict) { + // Extend to the original VT if necessary. + if (Cvt.getValueType() != VT) { + Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other}, + {Chain, Cvt}); + Chain = Cvt.getValue(1); + } + return DAG.getMergeValues({Cvt, Chain}, dl); + } + + // Extend to the original VT if necessary. + return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); +} + +// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to +// cases where the loads have the same input chain and the output chains are +// unused. This avoids any memory ordering issues. +static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + // Only do this if the chain result is unused. 
+ if (N->hasAnyUseOfValue(1)) + return SDValue(); + + auto *MemIntrin = cast<MemIntrinsicSDNode>(N); + + SDValue Ptr = MemIntrin->getBasePtr(); + SDValue Chain = MemIntrin->getChain(); + EVT VT = N->getSimpleValueType(0); + EVT MemVT = MemIntrin->getMemoryVT(); + + // Look at other users of our base pointer and try to find a wider broadcast. + // The input chain and the size of the memory VT must match. + for (SDNode *User : Ptr->uses()) + if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD && + cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && + cast<MemIntrinsicSDNode>(User)->getChain() == Chain && + cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == + MemVT.getSizeInBits() && + !User->hasAnyUseOfValue(1) && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), + VT.getSizeInBits()); + Extract = DAG.getBitcast(VT, Extract); + return DCI.CombineTo(N, Extract, SDValue(User, 1)); + } + + return SDValue(); +} + +static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || + SrcVT.getVectorElementType() != MVT::f32) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) + return SDValue(); + + SDLoc dl(N); + + // Widen to at least 4 input elements. + if (NumElts < 4) + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getConstantFP(0.0, dl, SrcVT)); + + // Destination is v8i16 with at least 8 elements. + EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + std::max(8U, NumElts)); + SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, + DAG.getTargetConstant(4, dl, MVT::i32)); + + // Extract down to real number of elements. + if (NumElts < 8) { + EVT IntVT = VT.changeVectorElementTypeToInteger(); + Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt, + DAG.getIntPtrConstant(0, dl)); + } + + return DAG.getBitcast(VT, Cvt); +} + +static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { + SDValue Src = N->getOperand(0); + + // Turn MOVDQ2Q+simple_load into an mmx load. 
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { + LoadSDNode *LN = cast<LoadSDNode>(Src.getNode()); + + if (LN->isSimple()) { + SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), + LN->getBasePtr(), + LN->getPointerInfo(), + LN->getOriginalAlign(), + LN->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1)); + return NewLd; + } + } + + return SDValue(); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -46002,8 +48810,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: return combineShiftLeft(N, DAG); - case ISD::SRA: return combineShiftRightArithmetic(N, DAG); - case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI); + case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget); + case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); @@ -46012,6 +48820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); + case X86ISD::VEXTRACT_STORE: + return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget); @@ -46020,14 +48830,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); - case ISD::FNEG: return combineFneg(N, DAG, Subtarget); + case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); - case X86ISD::VTRUNC: return combineVTRUNC(N, DAG); + case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: - case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); + case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: @@ -46036,8 +48846,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: + case X86ISD::STRICT_CVTTP2SI: case X86ISD::CVTTP2SI: - case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); + case X86ISD::STRICT_CVTTP2UI: + case X86ISD::CVTTP2UI: + return combineCVTP2I_CVTTP2I(N, DAG, DCI); + case X86ISD::STRICT_CVTPH2PS: + case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); @@ -46060,12 +48875,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); + case ISD::INSERT_VECTOR_ELT: case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case 
X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: + case X86ISD::VALIGN: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -46097,12 +48914,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: + case X86ISD::STRICT_FMSUB: case X86ISD::FMSUB_RND: case X86ISD::FNMADD: + case X86ISD::STRICT_FNMADD: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: + case X86ISD::STRICT_FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); + case ISD::FMA: + case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: @@ -46118,6 +48939,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); + case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); + case ISD::STRICT_FP_EXTEND: + case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); + case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); + case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI); + case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); } return SDValue(); @@ -46266,27 +49093,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { return true; } -bool X86TargetLowering:: - isDesirableToCombineBuildVectorToShuffleTruncate( - ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const { - - assert(SrcVT.getVectorNumElements() == ShuffleMask.size() && - "Element count mismatch"); - assert( - Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && - "Shuffle Mask expected to be legal"); - - // For 32-bit elements VPERMD is better than shuffle+truncate. - // TODO: After we improve lowerBuildVector, add execption for VPERMW. - if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2()) - return false; - - if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask)) - return false; - - return true; -} - //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -46327,7 +49133,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { - InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); + InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); const std::string &AsmStr = IA->getAsmString(); @@ -46450,7 +49256,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'y': case 'x': case 'v': - case 'Y': case 'l': case 'k': // AVX512 masking registers. return C_RegisterClass; @@ -46487,7 +49292,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { default: break; case 'z': - case '0': return C_Register; case 'i': case 'm': @@ -46543,19 +49347,17 @@ TargetLowering::ConstraintWeight if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; - case 'Y': { - unsigned Size = StringRef(constraint).size(); - // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' - char NextChar = Size == 2 ? 
constraint[1] : 'i'; - if (Size > 2) + case 'Y': + if (StringRef(constraint).size() != 2) break; - switch (NextChar) { + switch (constraint[1]) { default: return CW_Invalid; // XMM0 case 'z': - case '0': - if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) + if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || + ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) || + ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) @@ -46568,7 +49370,7 @@ TargetLowering::ConstraintWeight if (type->isX86_MMXTy() && Subtarget.hasMMX()) return weight; return CW_Invalid; - // Any SSE reg when ISA >= SSE2, same as 'Y' + // Any SSE reg when ISA >= SSE2, same as 'x' case 'i': case 't': case '2': @@ -46576,9 +49378,7 @@ TargetLowering::ConstraintWeight return CW_Invalid; break; } - // Fall through (handle "Y" constraint). - LLVM_FALLTHROUGH; - } + break; case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; @@ -46660,8 +49460,6 @@ LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { - if (Subtarget.hasSSE2()) - return "Y"; if (Subtarget.hasSSE1()) return "x"; } @@ -46910,26 +49708,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. if (Subtarget.is64Bit()) { - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, &X86::GR32RegClass); - if (VT == MVT::i16) - return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); - if (VT == MVT::i64 || VT == MVT::f64) + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16RegClass); + if (VT == MVT::i32 || VT == MVT::f32) + return std::make_pair(0U, &X86::GR32RegClass); + if (VT != MVT::f80) return std::make_pair(0U, &X86::GR64RegClass); break; } LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS - if (VT == MVT::i32 || VT == MVT::f32) - return std::make_pair(0U, &X86::GR32_ABCDRegClass); - if (VT == MVT::i16) - return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); - if (VT == MVT::i64) + if (VT == MVT::i16) + return std::make_pair(0U, &X86::GR16_ABCDRegClass); + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) + return std::make_pair(0U, &X86::GR32_ABCDRegClass); + if (VT != MVT::f80) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS @@ -46940,15 +49738,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); - return std::make_pair(0U, &X86::GR64RegClass); + if (VT != MVT::f80) + return std::make_pair(0U, &X86::GR64RegClass); + break; case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); - if (VT == MVT::i32 || !Subtarget.is64Bit()) + if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); - return std::make_pair(0U, &X86::GR64_NOREXRegClass); + if (VT != 
MVT::f80) + return std::make_pair(0U, &X86::GR64_NOREXRegClass); + break; case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. @@ -46956,13 +49758,12 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); - return std::make_pair(0U, &X86::RFP80RegClass); + if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) + return std::make_pair(0U, &X86::RFP80RegClass); + break; case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); - case 'Y': // SSE_REGS if SSE2 allowed - if (!Subtarget.hasSSE2()) break; - LLVM_FALLTHROUGH; case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; @@ -46981,7 +49782,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); - // TODO: Handle i128 in FR128RegClass after it is tested well. + case MVT::i128: + if (Subtarget.is64Bit()) { + if (VConstraint && Subtarget.hasVLX()) + return std::make_pair(0U, &X86::VR128XRegClass); + return std::make_pair(0U, &X86::VR128RegClass); + } + break; // Vector types and fp128. case MVT::f128: case MVT::v16i8: @@ -47005,6 +49812,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Subtarget.hasAVX()) return std::make_pair(0U, &X86::VR256RegClass); break; + case MVT::v64i8: + case MVT::v32i16: case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: @@ -47023,14 +49832,50 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'i': case 't': case '2': - return getRegForInlineAsmConstraint(TRI, "Y", VT); + return getRegForInlineAsmConstraint(TRI, "x", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': - case '0': if (!Subtarget.hasSSE1()) break; - return std::make_pair(X86::XMM0, &X86::VR128RegClass); + switch (VT.SimpleTy) { + default: break; + // Scalar SSE types. + case MVT::f32: + case MVT::i32: + return std::make_pair(X86::XMM0, &X86::FR32RegClass); + case MVT::f64: + case MVT::i64: + return std::make_pair(X86::XMM0, &X86::FR64RegClass); + case MVT::f128: + case MVT::v16i8: + case MVT::v8i16: + case MVT::v4i32: + case MVT::v2i64: + case MVT::v4f32: + case MVT::v2f64: + return std::make_pair(X86::XMM0, &X86::VR128RegClass); + // AVX types. + case MVT::v32i8: + case MVT::v16i16: + case MVT::v8i32: + case MVT::v4i64: + case MVT::v8f32: + case MVT::v4f64: + if (Subtarget.hasAVX()) + return std::make_pair(X86::YMM0, &X86::VR256RegClass); + break; + case MVT::v64i8: + case MVT::v32i16: + case MVT::v8f64: + case MVT::v16f32: + case MVT::v16i32: + case MVT::v8i64: + if (Subtarget.hasAVX512()) + return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass); + break; + } + break; case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { @@ -47056,7 +49901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. 
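// --- Illustrative sketch, not part of this change: the front-end side of the
// constraint lowering above. The "x" inline-asm constraint requests an SSE
// register, which this function maps to FR32/VR128 and friends; the 'Yz'
// constraint handled above additionally pins the operand to %xmm0. GCC/Clang
// inline-asm syntax, x86-64 only.
#include <cassert>

int main() {
  float a = 2.0f, b = 3.5f;
  asm("addss %1, %0" : "+x"(a) : "x"(b));
  assert(a == 5.5f);
  return 0;
}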
- std::pair<unsigned, const TargetRegisterClass*> Res; + std::pair<Register, const TargetRegisterClass*> Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? @@ -47127,7 +49972,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; - unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); + Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = @@ -47243,8 +50088,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. - bool OptSize = - Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); + bool OptSize = Attr.hasFnAttribute(Attribute::MinSize); return OptSize && !VT.isVector(); } @@ -47301,10 +50145,35 @@ bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } +/// Returns true if stack probing through a function call is requested. +bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const { + return !getStackProbeSymbolName(MF).empty(); +} + +/// Returns true if stack probing through inline assembly is requested. +bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const { + + // No inline stack probe for Windows, they have their own mechanism. + if (Subtarget.isOSWindows() || + MF.getFunction().hasFnAttribute("no-stack-arg-probe")) + return false; + + // If the function specifically requests inline stack probes, emit them. + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + + return false; +} + /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { + // Inline Stack probes disable stack probe call + if (hasInlineStackProbe(MF)) + return ""; + // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h index 830cdfc79c0a..7f3dc90a2d73 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h @@ -14,8 +14,6 @@ #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" namespace llvm { @@ -24,680 +22,809 @@ namespace llvm { namespace X86ISD { // X86 Specific DAG Nodes - enum NodeType : unsigned { - // Start the numbering where the builtin ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// Bit scan forward. - BSF, - /// Bit scan reverse. - BSR, - - /// Double shift instructions. These correspond to - /// X86::SHLDxx and X86::SHRDxx instructions. - SHLD, - SHRD, - - /// Bitwise logical AND of floating point values. This corresponds - /// to X86::ANDPS or X86::ANDPD. 
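The hasInlineStackProbe() / getStackProbeSymbolName() hooks added in the X86ISelLowering.cpp hunk above are driven purely by string function attributes. A minimal sketch of how a front end could opt a function into inline probing, not taken from this patch and using only the existing llvm::Function API (the helper name is invented for illustration):

#include "llvm/IR/Function.h"

// With "probe-stack" set to "inline-asm" the target emits inline probes; any
// other value is what getStackProbeSymbolName() returns, and probing then
// happens through a call to that symbol. "no-stack-arg-probe" disables the
// inline variant.
static void requestInlineStackProbes(llvm::Function &F) {
  F.addFnAttr("probe-stack", "inline-asm");
}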
- FAND, - - /// Bitwise logical OR of floating point values. This corresponds - /// to X86::ORPS or X86::ORPD. - FOR, - - /// Bitwise logical XOR of floating point values. This corresponds - /// to X86::XORPS or X86::XORPD. - FXOR, - - /// Bitwise logical ANDNOT of floating point values. This - /// corresponds to X86::ANDNPS or X86::ANDNPD. - FANDN, - - /// These operations represent an abstract X86 call - /// instruction, which includes a bunch of information. In particular the - /// operands of these node are: - /// - /// #0 - The incoming token chain - /// #1 - The callee - /// #2 - The number of arg bytes the caller pushes on the stack. - /// #3 - The number of arg bytes the callee pops off the stack. - /// #4 - The value to pass in AL/AX/EAX (optional) - /// #5 - The value to pass in DL/DX/EDX (optional) - /// - /// The result values of these nodes are: - /// - /// #0 - The outgoing token chain - /// #1 - The first register result value (optional) - /// #2 - The second register result value (optional) - /// - CALL, - - /// Same as call except it adds the NoTrack prefix. - NT_CALL, - - /// X86 compare and logical compare instructions. - CMP, COMI, UCOMI, - - /// X86 bit-test instructions. - BT, - - /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS - /// operand, usually produced by a CMP instruction. - SETCC, - - /// X86 Select - SELECTS, - - // Same as SETCC except it's materialized with a sbb and the value is all - // one's or all zero's. - SETCC_CARRY, // R = carry_bit ? ~0 : 0 - - /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. - /// Operands are two FP values to compare; result is a mask of - /// 0s or 1s. Generally DTRT for C/C++ with NaNs. - FSETCC, - - /// X86 FP SETCC, similar to above, but with output as an i1 mask and - /// and a version with SAE. - FSETCCM, FSETCCM_SAE, - - /// X86 conditional moves. Operand 0 and operand 1 are the two values - /// to select from. Operand 2 is the condition code, and operand 3 is the - /// flag operand produced by a CMP or TEST instruction. - CMOV, - - /// X86 conditional branches. Operand 0 is the chain operand, operand 1 - /// is the block to branch if condition is true, operand 2 is the - /// condition code, and operand 3 is the flag operand produced by a CMP - /// or TEST instruction. - BRCOND, - - /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and - /// operand 1 is the target address. - NT_BRIND, - - /// Return with a flag operand. Operand 0 is the chain operand, operand - /// 1 is the number of bytes of stack to pop. - RET_FLAG, - - /// Return from interrupt. Operand 0 is the number of bytes to pop. - IRET, - - /// Repeat fill, corresponds to X86::REP_STOSx. - REP_STOS, - - /// Repeat move, corresponds to X86::REP_MOVSx. - REP_MOVS, - - /// On Darwin, this node represents the result of the popl - /// at function entry, used for PIC code. - GlobalBaseReg, - - /// A wrapper node for TargetConstantPool, TargetJumpTable, - /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, - /// MCSymbol and TargetBlockAddress. - Wrapper, - - /// Special wrapper used under X86-64 PIC mode for RIP - /// relative displacements. - WrapperRIP, - - /// Copies a 64-bit value from an MMX vector to the low word - /// of an XMM vector, with the high word zero filled. - MOVQ2DQ, - - /// Copies a 64-bit value from the low word of an XMM vector - /// to an MMX vector. - MOVDQ2Q, - - /// Copies a 32-bit value from the low word of a MMX - /// vector to a GPR. 
- MMX_MOVD2W, - - /// Copies a GPR into the low 32-bit word of a MMX vector - /// and zero out the high word. - MMX_MOVW2D, - - /// Extract an 8-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRB. - PEXTRB, - - /// Extract a 16-bit value from a vector and zero extend it to - /// i32, corresponds to X86::PEXTRW. - PEXTRW, - - /// Insert any element of a 4 x float vector into any element - /// of a destination 4 x floatvector. - INSERTPS, - - /// Insert the lower 8-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRB. - PINSRB, - - /// Insert the lower 16-bits of a 32-bit value to a vector, - /// corresponds to X86::PINSRW. - PINSRW, - - /// Shuffle 16 8-bit values within a vector. - PSHUFB, - - /// Compute Sum of Absolute Differences. - PSADBW, - /// Compute Double Block Packed Sum-Absolute-Differences - DBPSADBW, - - /// Bitwise Logical AND NOT of Packed FP values. - ANDNP, - - /// Blend where the selector is an immediate. - BLENDI, - - /// Dynamic (non-constant condition) vector blend where only the sign bits - /// of the condition elements are used. This is used to enforce that the - /// condition mask is not valid for generic VSELECT optimizations. This - /// is also used to implement the intrinsics. - /// Operands are in VSELECT order: MASK, TRUE, FALSE - BLENDV, - - /// Combined add and sub on an FP vector. - ADDSUB, - - // FP vector ops with rounding mode. - FADD_RND, FADDS, FADDS_RND, - FSUB_RND, FSUBS, FSUBS_RND, - FMUL_RND, FMULS, FMULS_RND, - FDIV_RND, FDIVS, FDIVS_RND, - FMAX_SAE, FMAXS_SAE, - FMIN_SAE, FMINS_SAE, - FSQRT_RND, FSQRTS, FSQRTS_RND, - - // FP vector get exponent. - FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE, - // Extract Normalized Mantissas. - VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE, - // FP Scale. - SCALEF, SCALEF_RND, - SCALEFS, SCALEFS_RND, - - // Unsigned Integer average. - AVG, - - /// Integer horizontal add/sub. - HADD, - HSUB, - - /// Floating point horizontal add/sub. - FHADD, - FHSUB, - - // Detect Conflicts Within a Vector - CONFLICT, - - /// Floating point max and min. - FMAX, FMIN, - - /// Commutative FMIN and FMAX. - FMAXC, FMINC, - - /// Scalar intrinsic floating point max and min. - FMAXS, FMINS, - - /// Floating point reciprocal-sqrt and reciprocal approximation. - /// Note that these typically require refinement - /// in order to obtain suitable precision. - FRSQRT, FRCP, - - // AVX-512 reciprocal approximations with a little more precision. - RSQRT14, RSQRT14S, RCP14, RCP14S, - - // Thread Local Storage. - TLSADDR, - - // Thread Local Storage. A call to get the start address - // of the TLS block for the current module. - TLSBASEADDR, - - // Thread Local Storage. When calling to an OS provided - // thunk at the address from an earlier relocation. - TLSCALL, + enum NodeType : unsigned { + // Start the numbering where the builtin ops leave off. + FIRST_NUMBER = ISD::BUILTIN_OP_END, + + /// Bit scan forward. + BSF, + /// Bit scan reverse. + BSR, + + /// X86 funnel/double shift i16 instructions. These correspond to + /// X86::SHLDW and X86::SHRDW instructions which have different amt + /// modulo rules to generic funnel shifts. + /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD. + FSHL, + FSHR, + + /// Bitwise logical AND of floating point values. This corresponds + /// to X86::ANDPS or X86::ANDPD. + FAND, + + /// Bitwise logical OR of floating point values. This corresponds + /// to X86::ORPS or X86::ORPD. + FOR, + + /// Bitwise logical XOR of floating point values. 
This corresponds + /// to X86::XORPS or X86::XORPD. + FXOR, + + /// Bitwise logical ANDNOT of floating point values. This + /// corresponds to X86::ANDNPS or X86::ANDNPD. + FANDN, + + /// These operations represent an abstract X86 call + /// instruction, which includes a bunch of information. In particular the + /// operands of these node are: + /// + /// #0 - The incoming token chain + /// #1 - The callee + /// #2 - The number of arg bytes the caller pushes on the stack. + /// #3 - The number of arg bytes the callee pops off the stack. + /// #4 - The value to pass in AL/AX/EAX (optional) + /// #5 - The value to pass in DL/DX/EDX (optional) + /// + /// The result values of these nodes are: + /// + /// #0 - The outgoing token chain + /// #1 - The first register result value (optional) + /// #2 - The second register result value (optional) + /// + CALL, - // Exception Handling helpers. - EH_RETURN, + /// Same as call except it adds the NoTrack prefix. + NT_CALL, - // SjLj exception handling setjmp. - EH_SJLJ_SETJMP, + /// X86 compare and logical compare instructions. + CMP, + FCMP, + COMI, + UCOMI, - // SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, + /// X86 bit-test instructions. + BT, - // SjLj exception handling dispatch. - EH_SJLJ_SETUP_DISPATCH, + /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS + /// operand, usually produced by a CMP instruction. + SETCC, - /// Tail call return. See X86TargetLowering::LowerCall for - /// the list of operands. - TC_RETURN, + /// X86 Select + SELECTS, - // Vector move to low scalar and zero higher vector elements. - VZEXT_MOVL, + // Same as SETCC except it's materialized with a sbb and the value is all + // one's or all zero's. + SETCC_CARRY, // R = carry_bit ? ~0 : 0 - // Vector integer truncate. - VTRUNC, - // Vector integer truncate with unsigned/signed saturation. - VTRUNCUS, VTRUNCS, + /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD. + /// Operands are two FP values to compare; result is a mask of + /// 0s or 1s. Generally DTRT for C/C++ with NaNs. + FSETCC, - // Masked version of the above. Used when less than a 128-bit result is - // produced since the mask only applies to the lower elements and can't - // be represented by a select. - // SRC, PASSTHRU, MASK - VMTRUNC, VMTRUNCUS, VMTRUNCS, - - // Vector FP extend. - VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE, - - // Vector FP round. - VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND, - - // Masked version of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - VMFPROUND, - - // 128-bit vector logical left / right shift - VSHLDQ, VSRLDQ, - - // Vector shift elements - VSHL, VSRL, VSRA, - - // Vector variable shift - VSHLV, VSRLV, VSRAV, - - // Vector shift elements by immediate - VSHLI, VSRLI, VSRAI, - - // Shifts of mask registers. - KSHIFTL, KSHIFTR, - - // Bit rotate by immediate - VROTLI, VROTRI, - - // Vector packed double/float comparison. - CMPP, - - // Vector integer comparisons. - PCMPEQ, PCMPGT, - - // v8i16 Horizontal minimum and position. - PHMINPOS, - - MULTISHIFT, - - /// Vector comparison generating mask bits for fp and - /// integer signed and unsigned data types. - CMPM, - // Vector comparison with SAE for FP values - CMPM_SAE, - - // Arithmetic operations with FLAGS results. - ADD, SUB, ADC, SBB, SMUL, UMUL, - OR, XOR, AND, - - // Bit field extract. - BEXTR, - - // Zero High Bits Starting with Specified Bit Position. - BZHI, - - // X86-specific multiply by immediate. - MUL_IMM, - - // Vector sign bit extraction. 
- MOVMSK, - - // Vector bitwise comparisons. - PTEST, - - // Vector packed fp sign bitwise comparisons. - TESTP, - - // OR/AND test for masks. - KORTEST, - KTEST, - - // ADD for masks. - KADD, - - // Several flavors of instructions with vector shuffle behaviors. - // Saturated signed/unnsigned packing. - PACKSS, - PACKUS, - // Intra-lane alignr. - PALIGNR, - // AVX512 inter-lane alignr. - VALIGN, - PSHUFD, - PSHUFHW, - PSHUFLW, - SHUFP, - // VBMI2 Concat & Shift. - VSHLD, - VSHRD, - VSHLDV, - VSHRDV, - //Shuffle Packed Values at 128-bit granularity. - SHUF128, - MOVDDUP, - MOVSHDUP, - MOVSLDUP, - MOVLHPS, - MOVHLPS, - MOVSD, - MOVSS, - UNPCKL, - UNPCKH, - VPERMILPV, - VPERMILPI, - VPERMI, - VPERM2X128, - - // Variable Permute (VPERM). - // Res = VPERMV MaskV, V0 - VPERMV, - - // 3-op Variable Permute (VPERMT2). - // Res = VPERMV3 V0, MaskV, V1 - VPERMV3, - - // Bitwise ternary logic. - VPTERNLOG, - // Fix Up Special Packed Float32/64 values. - VFIXUPIMM, VFIXUPIMM_SAE, - VFIXUPIMMS, VFIXUPIMMS_SAE, - // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE, - // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE, - // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - // Also used by the legacy (V)ROUND intrinsics where we mask out the - // scaling part of the immediate. - VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE, - // Tests Types Of a FP Values for packed types. - VFPCLASS, - // Tests Types Of a FP Values for scalar types. - VFPCLASSS, - - // Broadcast (splat) scalar or element 0 of a vector. If the operand is - // a vector, this node may change the vector length as part of the splat. - VBROADCAST, - // Broadcast mask to vector. - VBROADCASTM, - // Broadcast subvector to vector. - SUBV_BROADCAST, - - /// SSE4A Extraction and Insertion. - EXTRQI, INSERTQI, - - // XOP arithmetic/logical shifts. - VPSHA, VPSHL, - // XOP signed/unsigned integer comparisons. - VPCOM, VPCOMU, - // XOP packed permute bytes. - VPPERM, - // XOP two source permutation. - VPERMIL2, - - // Vector multiply packed unsigned doubleword integers. - PMULUDQ, - // Vector multiply packed signed doubleword integers. - PMULDQ, - // Vector Multiply Packed UnsignedIntegers with Round and Scale. - MULHRS, - - // Multiply and Add Packed Integers. - VPMADDUBSW, VPMADDWD, - - // AVX512IFMA multiply and add. - // NOTE: These are different than the instruction and perform - // op0 x op1 + op2. - VPMADD52L, VPMADD52H, - - // VNNI - VPDPBUSD, - VPDPBUSDS, - VPDPWSSD, - VPDPWSSDS, - - // FMA nodes. - // We use the target independent ISD::FMA for the non-inverted case. - FNMADD, - FMSUB, - FNMSUB, - FMADDSUB, - FMSUBADD, - - // FMA with rounding mode. - FMADD_RND, - FNMADD_RND, - FMSUB_RND, - FNMSUB_RND, - FMADDSUB_RND, - FMSUBADD_RND, - - // Compress and expand. - COMPRESS, - EXPAND, - - // Bits shuffle - VPSHUFBITQMB, - - // Convert Unsigned/Integer to Floating-Point Value with rounding mode. - SINT_TO_FP_RND, UINT_TO_FP_RND, - SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP, - SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, - - // Vector float/double to signed/unsigned integer. - CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND, - // Scalar float/double to signed/unsigned integer. - CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND, - - // Vector float/double to signed/unsigned integer with truncation. 
- CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE, - // Scalar float/double to signed/unsigned integer with truncation. - CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE, - - // Vector signed/unsigned integer to float/double. - CVTSI2P, CVTUI2P, - - // Masked versions of above. Used for v2f64->v4f32. - // SRC, PASSTHRU, MASK - MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI, - MCVTSI2P, MCVTUI2P, - - // Vector float to bfloat16. - // Convert TWO packed single data to one packed BF16 data - CVTNE2PS2BF16, - // Convert packed single data to packed BF16 data - CVTNEPS2BF16, - // Masked version of above. - // SRC, PASSTHRU, MASK - MCVTNEPS2BF16, - - // Dot product of BF16 pairs to accumulated into - // packed single precision. - DPBF16PS, - - // Save xmm argument registers to the stack, according to %al. An operator - // is needed so that this can be expanded with control flow. - VASTART_SAVE_XMM_REGS, - - // Windows's _chkstk call to do stack probing. - WIN_ALLOCA, - - // For allocating variable amounts of stack space when using - // segmented stacks. Check if the current stacklet has enough space, and - // falls back to heap allocation if not. - SEG_ALLOCA, - - // Memory barriers. - MEMBARRIER, - MFENCE, - - // Store FP status word into i16 register. - FNSTSW16r, - - // Store contents of %ah into %eflags. - SAHF, - - // Get a random integer and indicate whether it is valid in CF. - RDRAND, - - // Get a NIST SP800-90B & C compliant random integer and - // indicate whether it is valid in CF. - RDSEED, - - // Protection keys - // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. - // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is - // value for ECX. - RDPKRU, WRPKRU, - - // SSE42 string comparisons. - // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG - // will emit one or two instructions based on which results are used. If - // flags and index/mask this allows us to use a single instruction since - // we won't have to pick and opcode for flags. Instead we can rely on the - // DAG to CSE everything and decide at isel. - PCMPISTR, - PCMPESTR, - - // Test if in transactional execution. - XTEST, - - // ERI instructions. - RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE, - RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE, - - // Conversions between float and half-float. - CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE, - - // Masked version of above. - // SRC, RND, PASSTHRU, MASK - MCVTPS2PH, - - // Galois Field Arithmetic Instructions - GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB, - - // LWP insert record. - LWPINS, - - // User level wait - UMWAIT, TPAUSE, - - // Enqueue Stores Instructions - ENQCMD, ENQCMDS, - - // For avx512-vp2intersect - VP2INTERSECT, - - /// X86 strict FP compare instructions. - STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, - STRICT_FCMPS, - - // Vector packed double/float comparison. - STRICT_CMPP, - - /// Vector comparison generating mask bits for fp and - /// integer signed and unsigned data types. - STRICT_CMPM, - - // Vector float/double to signed/unsigned integer with truncation. - STRICT_CVTTP2SI, STRICT_CVTTP2UI, - - // Vector FP extend. - STRICT_VFPEXT, - - // Vector FP round. - STRICT_VFPROUND, - - // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - // Also used by the legacy (V)ROUND intrinsics where we mask out the - // scaling part of the immediate. - STRICT_VRNDSCALE, - - // Vector signed/unsigned integer to float/double. - STRICT_CVTSI2P, STRICT_CVTUI2P, - - // Compare and swap. 
- LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, - LCMPXCHG8_DAG, - LCMPXCHG16_DAG, - LCMPXCHG8_SAVE_EBX_DAG, - LCMPXCHG16_SAVE_RBX_DAG, - - /// LOCK-prefixed arithmetic read-modify-write instructions. - /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) - LADD, LSUB, LOR, LXOR, LAND, - - // Load, scalar_to_vector, and zero extend. - VZEXT_LOAD, - - // extract_vector_elt, store. - VEXTRACT_STORE, - - // scalar broadcast from memory - VBROADCAST_LOAD, - - // Store FP control world into i16 memory. - FNSTCW16m, - - /// This instruction implements FP_TO_SINT with the - /// integer destination in memory and a FP reg source. This corresponds - /// to the X86::FIST*m instructions and the rounding mode change stuff. It - /// has two inputs (token chain and address) and two outputs (int value - /// and token chain). Memory VT specifies the type to store to. - FP_TO_INT_IN_MEM, - - /// This instruction implements SINT_TO_FP with the - /// integer source in memory and FP reg result. This corresponds to the - /// X86::FILD*m instructions. It has two inputs (token chain and address) - /// and two outputs (FP value and token chain). FILD_FLAG also produces a - /// flag). The integer source type is specified by the memory VT. - FILD, - FILD_FLAG, - - /// This instruction implements a fp->int store from FP stack - /// slots. This corresponds to the fist instruction. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FIST, - - /// This instruction implements an extending load to FP stack slots. - /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain - /// operand, and ptr to load from. The memory VT specifies the type to - /// load from. - FLD, + /// X86 FP SETCC, similar to above, but with output as an i1 mask and + /// and a version with SAE. + FSETCCM, + FSETCCM_SAE, - /// This instruction implements a truncating store from FP stack - /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a - /// chain operand, value to store, address, and glue. The memory VT - /// specifies the type to store as. - FST, - - /// This instruction grabs the address of the next argument - /// from a va_list. (reads and modifies the va_list in memory) - VAARG_64, - - // Vector truncating store with unsigned/signed saturation - VTRUNCSTOREUS, VTRUNCSTORES, - // Vector truncating masked store with unsigned/signed saturation - VMTRUNCSTOREUS, VMTRUNCSTORES, - - // X86 specific gather and scatter - MGATHER, MSCATTER, - - // WARNING: Do not add anything in the end unless you want the node to - // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all - // opcodes will be thought as target memory ops! - }; + /// X86 conditional moves. Operand 0 and operand 1 are the two values + /// to select from. Operand 2 is the condition code, and operand 3 is the + /// flag operand produced by a CMP or TEST instruction. + CMOV, + + /// X86 conditional branches. Operand 0 is the chain operand, operand 1 + /// is the block to branch if condition is true, operand 2 is the + /// condition code, and operand 3 is the flag operand produced by a CMP + /// or TEST instruction. + BRCOND, + + /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and + /// operand 1 is the target address. + NT_BRIND, + + /// Return with a flag operand. Operand 0 is the chain operand, operand + /// 1 is the number of bytes of stack to pop. + RET_FLAG, + + /// Return from interrupt. Operand 0 is the number of bytes to pop. 
+ IRET, + + /// Repeat fill, corresponds to X86::REP_STOSx. + REP_STOS, + + /// Repeat move, corresponds to X86::REP_MOVSx. + REP_MOVS, + + /// On Darwin, this node represents the result of the popl + /// at function entry, used for PIC code. + GlobalBaseReg, + + /// A wrapper node for TargetConstantPool, TargetJumpTable, + /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress, + /// MCSymbol and TargetBlockAddress. + Wrapper, + + /// Special wrapper used under X86-64 PIC mode for RIP + /// relative displacements. + WrapperRIP, + + /// Copies a 64-bit value from an MMX vector to the low word + /// of an XMM vector, with the high word zero filled. + MOVQ2DQ, + + /// Copies a 64-bit value from the low word of an XMM vector + /// to an MMX vector. + MOVDQ2Q, + + /// Copies a 32-bit value from the low word of a MMX + /// vector to a GPR. + MMX_MOVD2W, + + /// Copies a GPR into the low 32-bit word of a MMX vector + /// and zero out the high word. + MMX_MOVW2D, + + /// Extract an 8-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRB. + PEXTRB, + + /// Extract a 16-bit value from a vector and zero extend it to + /// i32, corresponds to X86::PEXTRW. + PEXTRW, + + /// Insert any element of a 4 x float vector into any element + /// of a destination 4 x floatvector. + INSERTPS, + + /// Insert the lower 8-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRB. + PINSRB, + + /// Insert the lower 16-bits of a 32-bit value to a vector, + /// corresponds to X86::PINSRW. + PINSRW, + + /// Shuffle 16 8-bit values within a vector. + PSHUFB, + + /// Compute Sum of Absolute Differences. + PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, + + /// Bitwise Logical AND NOT of Packed FP values. + ANDNP, + + /// Blend where the selector is an immediate. + BLENDI, + + /// Dynamic (non-constant condition) vector blend where only the sign bits + /// of the condition elements are used. This is used to enforce that the + /// condition mask is not valid for generic VSELECT optimizations. This + /// is also used to implement the intrinsics. + /// Operands are in VSELECT order: MASK, TRUE, FALSE + BLENDV, + + /// Combined add and sub on an FP vector. + ADDSUB, + + // FP vector ops with rounding mode. + FADD_RND, + FADDS, + FADDS_RND, + FSUB_RND, + FSUBS, + FSUBS_RND, + FMUL_RND, + FMULS, + FMULS_RND, + FDIV_RND, + FDIVS, + FDIVS_RND, + FMAX_SAE, + FMAXS_SAE, + FMIN_SAE, + FMINS_SAE, + FSQRT_RND, + FSQRTS, + FSQRTS_RND, + + // FP vector get exponent. + FGETEXP, + FGETEXP_SAE, + FGETEXPS, + FGETEXPS_SAE, + // Extract Normalized Mantissas. + VGETMANT, + VGETMANT_SAE, + VGETMANTS, + VGETMANTS_SAE, + // FP Scale. + SCALEF, + SCALEF_RND, + SCALEFS, + SCALEFS_RND, + + // Unsigned Integer average. + AVG, + + /// Integer horizontal add/sub. + HADD, + HSUB, + + /// Floating point horizontal add/sub. + FHADD, + FHSUB, + + // Detect Conflicts Within a Vector + CONFLICT, + + /// Floating point max and min. + FMAX, + FMIN, + + /// Commutative FMIN and FMAX. + FMAXC, + FMINC, + + /// Scalar intrinsic floating point max and min. + FMAXS, + FMINS, + + /// Floating point reciprocal-sqrt and reciprocal approximation. + /// Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, + FRCP, + + // AVX-512 reciprocal approximations with a little more precision. + RSQRT14, + RSQRT14S, + RCP14, + RCP14S, + + // Thread Local Storage. + TLSADDR, + + // Thread Local Storage. 
A call to get the start address + // of the TLS block for the current module. + TLSBASEADDR, + + // Thread Local Storage. When calling to an OS provided + // thunk at the address from an earlier relocation. + TLSCALL, + + // Exception Handling helpers. + EH_RETURN, + + // SjLj exception handling setjmp. + EH_SJLJ_SETJMP, + + // SjLj exception handling longjmp. + EH_SJLJ_LONGJMP, + + // SjLj exception handling dispatch. + EH_SJLJ_SETUP_DISPATCH, + + /// Tail call return. See X86TargetLowering::LowerCall for + /// the list of operands. + TC_RETURN, + + // Vector move to low scalar and zero higher vector elements. + VZEXT_MOVL, + + // Vector integer truncate. + VTRUNC, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, + VTRUNCS, + + // Masked version of the above. Used when less than a 128-bit result is + // produced since the mask only applies to the lower elements and can't + // be represented by a select. + // SRC, PASSTHRU, MASK + VMTRUNC, + VMTRUNCUS, + VMTRUNCS, + + // Vector FP extend. + VFPEXT, + VFPEXT_SAE, + VFPEXTS, + VFPEXTS_SAE, + + // Vector FP round. + VFPROUND, + VFPROUND_RND, + VFPROUNDS, + VFPROUNDS_RND, + + // Masked version of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + VMFPROUND, + + // 128-bit vector logical left / right shift + VSHLDQ, + VSRLDQ, + + // Vector shift elements + VSHL, + VSRL, + VSRA, + + // Vector variable shift + VSHLV, + VSRLV, + VSRAV, + + // Vector shift elements by immediate + VSHLI, + VSRLI, + VSRAI, + + // Shifts of mask registers. + KSHIFTL, + KSHIFTR, + + // Bit rotate by immediate + VROTLI, + VROTRI, + + // Vector packed double/float comparison. + CMPP, + + // Vector integer comparisons. + PCMPEQ, + PCMPGT, + + // v8i16 Horizontal minimum and position. + PHMINPOS, + + MULTISHIFT, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + CMPM, + // Vector comparison with SAE for FP values + CMPM_SAE, + + // Arithmetic operations with FLAGS results. + ADD, + SUB, + ADC, + SBB, + SMUL, + UMUL, + OR, + XOR, + AND, + + // Bit field extract. + BEXTR, + + // Zero High Bits Starting with Specified Bit Position. + BZHI, + + // Parallel extract and deposit. + PDEP, + PEXT, + + // X86-specific multiply by immediate. + MUL_IMM, + + // Vector sign bit extraction. + MOVMSK, + + // Vector bitwise comparisons. + PTEST, + + // Vector packed fp sign bitwise comparisons. + TESTP, + + // OR/AND test for masks. + KORTEST, + KTEST, + + // ADD for masks. + KADD, + + // Several flavors of instructions with vector shuffle behaviors. + // Saturated signed/unnsigned packing. + PACKSS, + PACKUS, + // Intra-lane alignr. + PALIGNR, + // AVX512 inter-lane alignr. + VALIGN, + PSHUFD, + PSHUFHW, + PSHUFLW, + SHUFP, + // VBMI2 Concat & Shift. + VSHLD, + VSHRD, + VSHLDV, + VSHRDV, + // Shuffle Packed Values at 128-bit granularity. + SHUF128, + MOVDDUP, + MOVSHDUP, + MOVSLDUP, + MOVLHPS, + MOVHLPS, + MOVSD, + MOVSS, + UNPCKL, + UNPCKH, + VPERMILPV, + VPERMILPI, + VPERMI, + VPERM2X128, + + // Variable Permute (VPERM). + // Res = VPERMV MaskV, V0 + VPERMV, + + // 3-op Variable Permute (VPERMT2). + // Res = VPERMV3 V0, MaskV, V1 + VPERMV3, + + // Bitwise ternary logic. + VPTERNLOG, + // Fix Up Special Packed Float32/64 values. + VFIXUPIMM, + VFIXUPIMM_SAE, + VFIXUPIMMS, + VFIXUPIMMS_SAE, + // Range Restriction Calculation For Packed Pairs of Float32/64 values. + VRANGE, + VRANGE_SAE, + VRANGES, + VRANGES_SAE, + // Reduce - Perform Reduction Transformation on scalar\packed FP. 
+ VREDUCE, + VREDUCE_SAE, + VREDUCES, + VREDUCES_SAE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + VRNDSCALE, + VRNDSCALE_SAE, + VRNDSCALES, + VRNDSCALES_SAE, + // Tests Types Of a FP Values for packed types. + VFPCLASS, + // Tests Types Of a FP Values for scalar types. + VFPCLASSS, + + // Broadcast (splat) scalar or element 0 of a vector. If the operand is + // a vector, this node may change the vector length as part of the splat. + VBROADCAST, + // Broadcast mask to vector. + VBROADCASTM, + // Broadcast subvector to vector. + SUBV_BROADCAST, + + /// SSE4A Extraction and Insertion. + EXTRQI, + INSERTQI, + + // XOP arithmetic/logical shifts. + VPSHA, + VPSHL, + // XOP signed/unsigned integer comparisons. + VPCOM, + VPCOMU, + // XOP packed permute bytes. + VPPERM, + // XOP two source permutation. + VPERMIL2, + + // Vector multiply packed unsigned doubleword integers. + PMULUDQ, + // Vector multiply packed signed doubleword integers. + PMULDQ, + // Vector Multiply Packed UnsignedIntegers with Round and Scale. + MULHRS, + + // Multiply and Add Packed Integers. + VPMADDUBSW, + VPMADDWD, + + // AVX512IFMA multiply and add. + // NOTE: These are different than the instruction and perform + // op0 x op1 + op2. + VPMADD52L, + VPMADD52H, + + // VNNI + VPDPBUSD, + VPDPBUSDS, + VPDPWSSD, + VPDPWSSDS, + + // FMA nodes. + // We use the target independent ISD::FMA for the non-inverted case. + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + + // FMA with rounding mode. + FMADD_RND, + FNMADD_RND, + FMSUB_RND, + FNMSUB_RND, + FMADDSUB_RND, + FMSUBADD_RND, + + // Compress and expand. + COMPRESS, + EXPAND, + + // Bits shuffle + VPSHUFBITQMB, + + // Convert Unsigned/Integer to Floating-Point Value with rounding mode. + SINT_TO_FP_RND, + UINT_TO_FP_RND, + SCALAR_SINT_TO_FP, + SCALAR_UINT_TO_FP, + SCALAR_SINT_TO_FP_RND, + SCALAR_UINT_TO_FP_RND, + + // Vector float/double to signed/unsigned integer. + CVTP2SI, + CVTP2UI, + CVTP2SI_RND, + CVTP2UI_RND, + // Scalar float/double to signed/unsigned integer. + CVTS2SI, + CVTS2UI, + CVTS2SI_RND, + CVTS2UI_RND, + + // Vector float/double to signed/unsigned integer with truncation. + CVTTP2SI, + CVTTP2UI, + CVTTP2SI_SAE, + CVTTP2UI_SAE, + // Scalar float/double to signed/unsigned integer with truncation. + CVTTS2SI, + CVTTS2UI, + CVTTS2SI_SAE, + CVTTS2UI_SAE, + + // Vector signed/unsigned integer to float/double. + CVTSI2P, + CVTUI2P, + + // Masked versions of above. Used for v2f64->v4f32. + // SRC, PASSTHRU, MASK + MCVTP2SI, + MCVTP2UI, + MCVTTP2SI, + MCVTTP2UI, + MCVTSI2P, + MCVTUI2P, + + // Vector float to bfloat16. + // Convert TWO packed single data to one packed BF16 data + CVTNE2PS2BF16, + // Convert packed single data to packed BF16 data + CVTNEPS2BF16, + // Masked version of above. + // SRC, PASSTHRU, MASK + MCVTNEPS2BF16, + + // Dot product of BF16 pairs to accumulated into + // packed single precision. + DPBF16PS, + + // Save xmm argument registers to the stack, according to %al. An operator + // is needed so that this can be expanded with control flow. + VASTART_SAVE_XMM_REGS, + + // Windows's _chkstk call to do stack probing. + WIN_ALLOCA, + + // For allocating variable amounts of stack space when using + // segmented stacks. Check if the current stacklet has enough space, and + // falls back to heap allocation if not. + SEG_ALLOCA, + + // For allocating stack space when using stack clash protector. 
+ // Allocation is performed by block, and each block is probed. + PROBED_ALLOCA, + + // Memory barriers. + MEMBARRIER, + MFENCE, + + // Get a random integer and indicate whether it is valid in CF. + RDRAND, + + // Get a NIST SP800-90B & C compliant random integer and + // indicate whether it is valid in CF. + RDSEED, + + // Protection keys + // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX. + // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is + // value for ECX. + RDPKRU, + WRPKRU, + + // SSE42 string comparisons. + // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG + // will emit one or two instructions based on which results are used. If + // flags and index/mask this allows us to use a single instruction since + // we won't have to pick and opcode for flags. Instead we can rely on the + // DAG to CSE everything and decide at isel. + PCMPISTR, + PCMPESTR, + + // Test if in transactional execution. + XTEST, + + // ERI instructions. + RSQRT28, + RSQRT28_SAE, + RSQRT28S, + RSQRT28S_SAE, + RCP28, + RCP28_SAE, + RCP28S, + RCP28S_SAE, + EXP2, + EXP2_SAE, + + // Conversions between float and half-float. + CVTPS2PH, + CVTPH2PS, + CVTPH2PS_SAE, + + // Masked version of above. + // SRC, RND, PASSTHRU, MASK + MCVTPS2PH, + + // Galois Field Arithmetic Instructions + GF2P8AFFINEINVQB, + GF2P8AFFINEQB, + GF2P8MULB, + + // LWP insert record. + LWPINS, + + // User level wait + UMWAIT, + TPAUSE, + + // Enqueue Stores Instructions + ENQCMD, + ENQCMDS, + + // For avx512-vp2intersect + VP2INTERSECT, + + /// X86 strict FP compare instructions. + STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCMPS, + + // Vector packed double/float comparison. + STRICT_CMPP, + + /// Vector comparison generating mask bits for fp and + /// integer signed and unsigned data types. + STRICT_CMPM, + + // Vector float/double to signed/unsigned integer with truncation. + STRICT_CVTTP2SI, + STRICT_CVTTP2UI, + + // Vector FP extend. + STRICT_VFPEXT, + + // Vector FP round. + STRICT_VFPROUND, + + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + STRICT_VRNDSCALE, + + // Vector signed/unsigned integer to float/double. + STRICT_CVTSI2P, + STRICT_CVTUI2P, + + // Strict FMA nodes. + STRICT_FNMADD, + STRICT_FMSUB, + STRICT_FNMSUB, + + // Conversions between float and half-float. + STRICT_CVTPS2PH, + STRICT_CVTPH2PS, + + // Compare and swap. + LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE, + LCMPXCHG8_DAG, + LCMPXCHG16_DAG, + LCMPXCHG8_SAVE_EBX_DAG, + LCMPXCHG16_SAVE_RBX_DAG, + + /// LOCK-prefixed arithmetic read-modify-write instructions. + /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) + LADD, + LSUB, + LOR, + LXOR, + LAND, + + // Load, scalar_to_vector, and zero extend. + VZEXT_LOAD, + + // extract_vector_elt, store. + VEXTRACT_STORE, + + // scalar broadcast from memory + VBROADCAST_LOAD, + + // Store FP control world into i16 memory. + FNSTCW16m, + + /// This instruction implements FP_TO_SINT with the + /// integer destination in memory and a FP reg source. This corresponds + /// to the X86::FIST*m instructions and the rounding mode change stuff. It + /// has two inputs (token chain and address) and two outputs (int value + /// and token chain). Memory VT specifies the type to store to. + FP_TO_INT_IN_MEM, + + /// This instruction implements SINT_TO_FP with the + /// integer source in memory and FP reg result. 
This corresponds to the + /// X86::FILD*m instructions. It has two inputs (token chain and address) + /// and two outputs (FP value and token chain). The integer source type is + /// specified by the memory VT. + FILD, + + /// This instruction implements a fp->int store from FP stack + /// slots. This corresponds to the fist instruction. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FIST, + + /// This instruction implements an extending load to FP stack slots. + /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain + /// operand, and ptr to load from. The memory VT specifies the type to + /// load from. + FLD, + + /// This instruction implements a truncating store from FP stack + /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a + /// chain operand, value to store, address, and glue. The memory VT + /// specifies the type to store as. + FST, + + /// This instruction grabs the address of the next argument + /// from a va_list. (reads and modifies the va_list in memory) + VAARG_64, + + // Vector truncating store with unsigned/signed saturation + VTRUNCSTOREUS, + VTRUNCSTORES, + // Vector truncating masked store with unsigned/signed saturation + VMTRUNCSTOREUS, + VMTRUNCSTORES, + + // X86 specific gather and scatter + MGATHER, + MSCATTER, + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all + // opcodes will be thought as target memory ops! + }; } // end namespace X86ISD /// Define some predicates that are used for node matching. @@ -717,7 +844,10 @@ namespace llvm { /// If Op is a constant whose elements are all the same constant or /// undefined, return true and return the constant value in \p SplatVal. - bool isConstantSplat(SDValue Op, APInt &SplatVal); + /// If we have undef bits that don't cover an entire element, we treat these + /// as zero if AllowPartialUndefs is set, else we fail and return false. + bool isConstantSplat(SDValue Op, APInt &SplatVal, + bool AllowPartialUndefs = true); } // end namespace X86 //===--------------------------------------------------------------------===// @@ -756,19 +886,7 @@ namespace llvm { unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const override; - /// Returns the target specific optimal type for load - /// and store operations as a result of memset, memcpy, and memmove - /// lowering. If DstAlign is zero that means it's safe to destination - /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it - /// means there isn't a need to check it against alignment requirement, - /// probably because the source does not need to be loaded. If 'IsMemset' is - /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that - /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy - /// source is constant so it does not need to be loaded. - /// It returns EVT::Other if the type should be determined using generic - /// target-independent logic. 
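A small usage sketch for the extended X86::isConstantSplat() declared above (wrapper name and includes are illustrative, not part of the patch): leaving AllowPartialUndefs at its default of true reads undef bits that cover only part of an element as zero, while passing false makes the query fail for such values.

#include "X86ISelLowering.h" // declares llvm::X86::isConstantSplat
#include "llvm/ADT/APInt.h"

// Succeed only when every element is the same, fully defined constant.
static bool getStrictSplat(llvm::SDValue Op, llvm::APInt &SplatVal) {
  return llvm::X86::isConstantSplat(Op, SplatVal,
                                    /*AllowPartialUndefs=*/false);
}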
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override; /// Returns true if it's safe to use load / store of the @@ -805,19 +923,6 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - // Return true if it is profitable to combine a BUILD_VECTOR with a - // stride-pattern to a shuffle and a truncate. - // Example of such a combine: - // v4i32 build_vector((extract_elt V, 1), - // (extract_elt V, 3), - // (extract_elt V, 5), - // (extract_elt V, 7)) - // --> - // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to - // v4i64) - bool isDesirableToCombineBuildVectorToShuffleTruncate( - ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override; - /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 @@ -830,15 +935,12 @@ namespace llvm { /// and some i16 instructions are slow. bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; - /// Return 1 if we can compute the negated form of the specified expression - /// for the same cost as the expression itself, or 2 if we can compute the - /// negated form more cheaply than the expression itself. Else return 0. - char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations, - bool ForCodeSize, unsigned Depth) const override; - - /// If isNegatibleForFree returns true, return the newly negated expression. + /// Return the newly negated expression if the cost is not expensive and + /// set the cost in \p Cost to indicate that if it is cheaper or neutral to + /// do the negation. SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, + NegatibleCost &Cost, unsigned Depth) const override; MachineBasicBlock * @@ -934,7 +1036,8 @@ namespace llvm { EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; - bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, + bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, + const APInt &DemandedElts, TargetLoweringOpt &TLO) const override; /// Determine which of the bits specified in Mask are known to be either @@ -958,6 +1061,12 @@ namespace llvm { TargetLoweringOpt &TLO, unsigned Depth) const override; + bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, + const APInt &DemandedElts, + unsigned MaskIndex, + TargetLoweringOpt &TLO, + unsigned Depth) const; + bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, @@ -1047,6 +1156,8 @@ namespace llvm { int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// This is used to enable splatted operand transforms for vector shifts + /// and vector funnel shifts. bool isVectorShiftByScalarCheap(Type *Ty) const override; /// Add x86-specific opcodes to the default list. 
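The isNegatibleForFree() / getNegatedExpression() pair above collapses into a single query whose cost comes back through an out-parameter. A hedged sketch of the new calling pattern (helper name invented; the TargetLowering interface is as declared in the header):

#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"

// Only hand back the negated form when it is strictly cheaper than Op itself.
static llvm::SDValue tryCheaperNegation(const llvm::TargetLowering &TLI,
                                        llvm::SDValue Op,
                                        llvm::SelectionDAG &DAG,
                                        bool LegalOps, bool ForCodeSize) {
  using NC = llvm::TargetLowering::NegatibleCost;
  NC Cost = NC::Expensive;
  llvm::SDValue Neg =
      TLI.getNegatedExpression(Op, DAG, LegalOps, ForCodeSize, Cost);
  return (Neg && Cost == NC::Cheaper) ? Neg : llvm::SDValue();
}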
@@ -1075,6 +1186,10 @@ namespace llvm { bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; + bool shouldSinkOperands(Instruction *I, + SmallVectorImpl<Use *> &Ops) const override; + bool shouldConvertPhiType(Type *From, Type *To) const override; + /// Return true if folding a vector load into ExtVal (a sign, zero, or any /// extend node) is profitable. bool isVectorLoadExtDesirable(SDValue) const override; @@ -1171,7 +1286,8 @@ namespace llvm { /// Overflow nodes should get combined/lowered to optimal instructions /// (they should allow eliminating explicit compares by getting flags from /// math ops). - bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override; + bool shouldFormOverflowOp(unsigned Opcode, EVT VT, + bool MathUsed) const override; bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AddrSpace) const override { @@ -1194,12 +1310,12 @@ namespace llvm { /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. - unsigned + Register getExceptionPointerRegister(const Constant *PersonalityFn) const override; /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. - unsigned + Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override; virtual bool needsFixedCatchObjects() const override; @@ -1227,8 +1343,10 @@ namespace llvm { /// offset as appropriate. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; - std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, - SDValue StackSlot, + std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, + SDValue Chain, SDValue Pointer, + MachinePointerInfo PtrInfo, + Align Alignment, SelectionDAG &DAG) const; bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; @@ -1236,6 +1354,8 @@ namespace llvm { /// Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; + bool softPromoteHalfType() const override { return true; } + MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; @@ -1251,6 +1371,8 @@ namespace llvm { bool supportSwiftError() const override; + bool hasStackProbeSymbol(MachineFunction &MF) const override; + bool hasInlineStackProbe(MachineFunction &MF) const override; StringRef getStackProbeSymbolName(MachineFunction &MF) const override; unsigned getStackProbeSize(MachineFunction &MF) const; @@ -1314,7 +1436,7 @@ namespace llvm { SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, - ISD::ArgFlagsTy Flags) const; + ISD::ArgFlagsTy Flags, bool isByval) const; // Call lowering helpers. 
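Among the hook changes above, shouldFormOverflowOp() now also learns whether the arithmetic result is consumed. An illustrative source pattern (not from the patch) that this decision governs; here the sum is used as well as tested, so the target would be queried with MathUsed == true when deciding whether to form a single add-with-overflow node:

#include <cstdint>

// The comparison below is the canonical unsigned-overflow check that can be
// folded into the flags of the same add.
static bool addWraps(uint32_t a, uint32_t b, uint32_t &sum) {
  sum = a + b;
  return sum < a;
}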
@@ -1340,8 +1462,9 @@ namespace llvm { unsigned getAddressSpace(void) const; - SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned, + SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, SDValue &Chain) const; + SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -1365,8 +1488,8 @@ namespace llvm { SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; @@ -1431,7 +1554,7 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; TargetLoweringBase::AtomicExpansionKind - shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; @@ -1464,18 +1587,15 @@ namespace llvm { MachineBasicBlock *EmitLoweredSelect(MachineInstr &I, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I, - MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI, - MachineBasicBlock *BB) const; - MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const; @@ -1497,32 +1617,25 @@ namespace llvm { MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const; - MachineBasicBlock *emitFMA3Instr(MachineInstr &MI, - MachineBasicBlock *MBB) const; - MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const; - /// Convert a comparison if required by the subtarget. - SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const; - /// Emit flags for the given setcc condition and operands. Also returns the /// corresponding X86 condition code constant in X86CC. SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, - SDValue &X86CC, SDValue &Chain, - bool IsSignaling) const; + SDValue &X86CC) const; /// Check if replacement of SQRT with RSQRT should be disabled. - bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override; + bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override; /// Use rsqrt* to speed up sqrt calculations. - SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override; /// Use rcp* to speed up fdiv calculations. 
- SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override; /// Reassociate floating point divisions into multiply by reciprocal. @@ -1537,101 +1650,14 @@ namespace llvm { const TargetLibraryInfo *libInfo); } // end namespace X86 - // Base class for all X86 non-masked store operations. - class X86StoreSDNode : public MemSDNode { - public: - X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl, - SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {} - const SDValue &getValue() const { return getOperand(1); } - const SDValue &getBasePtr() const { return getOperand(2); } - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::VTRUNCSTORES || - N->getOpcode() == X86ISD::VTRUNCSTOREUS; - } - }; - - // Base class for all X86 masked store operations. - // The class has the same order of operands as MaskedStoreSDNode for - // convenience. - class X86MaskedStoreSDNode : public MemSDNode { - public: - X86MaskedStoreSDNode(unsigned Opcode, unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {} - - const SDValue &getValue() const { return getOperand(1); } - const SDValue &getBasePtr() const { return getOperand(2); } - const SDValue &getMask() const { return getOperand(3); } - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::VMTRUNCSTORES || - N->getOpcode() == X86ISD::VMTRUNCSTOREUS; - } - }; - - // X86 Truncating Store with Signed saturation. - class TruncSStoreSDNode : public X86StoreSDNode { - public: - TruncSStoreSDNode(unsigned Order, const DebugLoc &dl, - SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) - : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {} - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::VTRUNCSTORES; - } - }; - - // X86 Truncating Store with Unsigned saturation. - class TruncUSStoreSDNode : public X86StoreSDNode { - public: - TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl, - SDVTList VTs, EVT MemVT, MachineMemOperand *MMO) - : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {} - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::VTRUNCSTOREUS; - } - }; - - // X86 Truncating Masked Store with Signed saturation. - class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode { - public: - MaskedTruncSStoreSDNode(unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {} - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::VMTRUNCSTORES; - } - }; - - // X86 Truncating Masked Store with Unsigned saturation. - class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode { - public: - MaskedTruncUSStoreSDNode(unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {} - - static bool classof(const SDNode *N) { - return N->getOpcode() == X86ISD::VMTRUNCSTOREUS; - } - }; - // X86 specific Gather/Scatter nodes. // The class has the same order of operands as MaskedGatherScatterSDNode for // convenience. 
- class X86MaskedGatherScatterSDNode : public MemSDNode { + class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode { public: - X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {} + // This is a intended as a utility and should never be directly created. + X86MaskedGatherScatterSDNode() = delete; + ~X86MaskedGatherScatterSDNode() = delete; const SDValue &getBasePtr() const { return getOperand(3); } const SDValue &getIndex() const { return getOperand(4); } @@ -1646,11 +1672,6 @@ namespace llvm { class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode { public: - X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - EVT MemVT, MachineMemOperand *MMO) - : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, - MMO) {} - const SDValue &getPassThru() const { return getOperand(1); } static bool classof(const SDNode *N) { @@ -1660,11 +1681,6 @@ namespace llvm { class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode { public: - X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - EVT MemVT, MachineMemOperand *MMO) - : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT, - MMO) {} - const SDValue &getValue() const { return getOperand(1); } static bool classof(const SDNode *N) { @@ -1673,47 +1689,15 @@ namespace llvm { }; /// Generate unpacklo/unpackhi shuffle mask. - template <typename T = int> - void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo, - bool Unary) { - assert(Mask.empty() && "Expected an empty shuffle mask vector"); - int NumElts = VT.getVectorNumElements(); - int NumEltsInLane = 128 / VT.getScalarSizeInBits(); - for (int i = 0; i < NumElts; ++i) { - unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; - int Pos = (i % NumEltsInLane) / 2 + LaneStart; - Pos += (Unary ? 0 : NumElts * (i % 2)); - Pos += (Lo ? 0 : NumEltsInLane / 2); - Mask.push_back(Pos); - } - } - - /// Helper function to scale a shuffle or target shuffle mask, replacing each - /// mask index with the scaled sequential indices for an equivalent narrowed - /// mask. This is the reverse process to canWidenShuffleElements, but can - /// always succeed. - template <typename T> - void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask, - SmallVectorImpl<T> &ScaledMask) { - assert(0 < Scale && "Unexpected scaling factor"); - size_t NumElts = Mask.size(); - ScaledMask.assign(NumElts * Scale, -1); - - for (size_t i = 0; i != NumElts; ++i) { - int M = Mask[i]; - - // Repeat sentinel values in every mask element. - if (M < 0) { - for (size_t s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = M; - continue; - } - - // Scale mask element and increment across each mask element. - for (size_t s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = (Scale * M) + s; - } - } + void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, + bool Unary); + + /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation + /// imposed by AVX and specific to the unary pattern. 
Example: + /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> + /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> + void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo); + } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 0a79b793a980..1628f85da808 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -92,9 +92,7 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) { if (!CalleeFn) return false; AttributeList Attrs = CalleeFn->getAttributes(); - if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice)) - return true; - return false; + return Attrs.hasFnAttribute(Attribute::ReturnsTwice); } bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { @@ -138,17 +136,38 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { if (MBB.hasAddressTaken()) Changed |= addENDBR(MBB, MBB.begin()); - // Exception handle may indirectly jump to catch pad, So we should add - // ENDBR before catch pad instructions. - bool EHPadIBTNeeded = MBB.isEHPad(); - for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { if (I->isCall() && IsCallReturnTwice(I->getOperand(0))) Changed |= addENDBR(MBB, std::next(I)); + } - if (EHPadIBTNeeded && I->isEHLabel()) { + // Exception handle may indirectly jump to catch pad, So we should add + // ENDBR before catch pad instructions. For SjLj exception model, it will + // create a new BB(new landingpad) indirectly jump to the old landingpad. + if (TM->Options.ExceptionModel == ExceptionHandling::SjLj) { + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + // New Landingpad BB without EHLabel. + if (MBB.isEHPad()) { + if (I->isDebugInstr()) + continue; + Changed |= addENDBR(MBB, I); + break; + } else if (I->isEHLabel()) { + // Old Landingpad BB (is not Landingpad now) with + // the the old "callee" EHLabel. 
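Referring back to the X86ISelLowering.h hunk above: createSplat2ShuffleMask is now only declared in the header, together with the documented output. The standalone sketch below is illustrative only (function name and shape are not the backend's implementation); it just builds a mask matching the documented example.

// Illustrative only: produces the mask documented for createSplat2ShuffleMask,
// e.g. NumElts = 8, Lo  -> <0, 0, 1, 1, 2, 2, 3, 3>
//      NumElts = 8, !Lo -> <4, 4, 5, 5, 6, 6, 7, 7>
#include <cassert>
#include <vector>

std::vector<int> splat2ShuffleMask(unsigned NumElts, bool Lo) {
  assert(NumElts % 2 == 0 && "expected an even element count");
  std::vector<int> Mask;
  Mask.reserve(NumElts);
  unsigned Base = Lo ? 0 : NumElts / 2; // take the low or the high half
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(int(Base + i));      // every source element is repeated
    Mask.push_back(int(Base + i));      // twice, with no 128-bit lane split
  }
  return Mask;
}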
+ MCSymbol *Sym = I->getOperand(0).getMCSymbol(); + if (!MF.hasCallSiteLandingPad(Sym)) + continue; + Changed |= addENDBR(MBB, std::next(I)); + break; + } + } + } else if (MBB.isEHPad()){ + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (!I->isEHLabel()) + continue; Changed |= addENDBR(MBB, std::next(I)); - EHPadIBTNeeded = false; + break; } } } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp index 36b9c3ccc959..828887d96129 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -29,6 +29,7 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -40,6 +41,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -56,23 +58,6 @@ static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_"; static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11"; namespace { -template <typename Derived> class ThunkInserter { - Derived &getDerived() { return *static_cast<Derived *>(this); } - -protected: - bool InsertedThunks; - void doInitialization(Module &M) {} - void createThunkFunction(MachineModuleInfo &MMI, StringRef Name); - -public: - void init(Module &M) { - InsertedThunks = false; - getDerived().doInitialization(M); - } - // return `true` if `MMI` or `MF` was modified - bool run(MachineModuleInfo &MMI, MachineFunction &MF); -}; - struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> { const char *getThunkPrefix() { return RetpolineNamePrefix; } bool mayUseThunk(const MachineFunction &MF) { @@ -94,12 +79,9 @@ struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> { createThunkFunction(MMI, R11LVIThunkName); } void populateThunk(MachineFunction &MF) { - // Grab the entry MBB and erase any other blocks. O0 codegen appears to - // generate two bbs for the entry block. + assert (MF.size() == 1); MachineBasicBlock *Entry = &MF.front(); Entry->clear(); - while (MF.size() > 1) - MF.erase(std::next(MF.begin())); // This code mitigates LVI by replacing each indirect call/jump with a // direct call/jump to a thunk that looks like: @@ -128,12 +110,6 @@ public: bool doInitialization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired<MachineModuleInfoWrapperPass>(); - AU.addPreserved<MachineModuleInfoWrapperPass>(); - } - private: std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs; @@ -224,12 +200,9 @@ void RetpolineThunkInserter::populateThunk(MachineFunction &MF) { } const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); - // Grab the entry MBB and erase any other blocks. O0 codegen appears to - // generate two bbs for the entry block. 
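The LVI comment above is cut off by the hunk boundary before it shows the thunk body. As a hedged sketch (inferred from the __llvm_lvi_thunk_r11 name used by this inserter and from the BuildMI style visible elsewhere in this file, not copied from the patch), populating the thunk amounts to emitting an LFENCE followed by an indirect jump through r11 into the single entry block:

// Hedged sketch, assuming the same headers as X86IndirectThunks.cpp
// (X86InstrBuilder.h, X86Subtarget.h, MachineInstrBuilder.h).
static void populateLVIThunkSketch(MachineFunction &MF) {
  MachineBasicBlock *Entry = &MF.front();
  Entry->clear();
  const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  BuildMI(Entry, DebugLoc(), TII->get(X86::LFENCE));  // fence loads first
  BuildMI(Entry, DebugLoc(), TII->get(X86::JMP64r))   // then jump through the
      .addReg(X86::R11);                              // callee address in %r11
}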
+ assert (MF.size() == 1); MachineBasicBlock *Entry = &MF.front(); Entry->clear(); - while (MF.size() > 1) - MF.erase(std::next(MF.begin())); MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock()); @@ -279,73 +252,6 @@ void RetpolineThunkInserter::populateThunk(MachineFunction &MF) { BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc)); } -template <typename Derived> -void ThunkInserter<Derived>::createThunkFunction(MachineModuleInfo &MMI, - StringRef Name) { - assert(Name.startswith(getDerived().getThunkPrefix()) && - "Created a thunk with an unexpected prefix!"); - - Module &M = const_cast<Module &>(*MMI.getModule()); - LLVMContext &Ctx = M.getContext(); - auto Type = FunctionType::get(Type::getVoidTy(Ctx), false); - Function *F = - Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M); - F->setVisibility(GlobalValue::HiddenVisibility); - F->setComdat(M.getOrInsertComdat(Name)); - - // Add Attributes so that we don't create a frame, unwind information, or - // inline. - AttrBuilder B; - B.addAttribute(llvm::Attribute::NoUnwind); - B.addAttribute(llvm::Attribute::Naked); - F->addAttributes(llvm::AttributeList::FunctionIndex, B); - - // Populate our function a bit so that we can verify. - BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F); - IRBuilder<> Builder(Entry); - - Builder.CreateRetVoid(); - - // MachineFunctions/MachineBasicBlocks aren't created automatically for the - // IR-level constructs we already made. Create them and insert them into the - // module. - MachineFunction &MF = MMI.getOrCreateMachineFunction(*F); - MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry); - - // Insert EntryMBB into MF. It's not in the module until we do this. - MF.insert(MF.end(), EntryMBB); - // Set MF properties. We never use vregs... - MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); -} - -template <typename Derived> -bool ThunkInserter<Derived>::run(MachineModuleInfo &MMI, MachineFunction &MF) { - // If MF is not a thunk, check to see if we need to insert a thunk. - if (!MF.getName().startswith(getDerived().getThunkPrefix())) { - // If we've already inserted a thunk, nothing else to do. - if (InsertedThunks) - return false; - - // Only add a thunk if one of the functions has the corresponding feature - // enabled in its subtarget, and doesn't enable external thunks. - // FIXME: Conditionalize on indirect calls so we don't emit a thunk when - // nothing will end up calling it. - // FIXME: It's a little silly to look at every function just to enumerate - // the subtargets, but eventually we'll want to look at them for indirect - // calls, so maybe this is OK. - if (!getDerived().mayUseThunk(MF)) - return false; - - getDerived().insertThunks(MMI); - InsertedThunks = true; - return true; - } - - // If this *is* a thunk function, we need to populate it with the correct MI. 
- getDerived().populateThunk(MF); - return true; -} - FunctionPass *llvm::createX86IndirectThunksPass() { return new X86IndirectThunks(); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 2b1e3f23efd7..53925bbfd72f 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) { void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired<MachineModuleInfoWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); } bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp new file mode 100644 index 000000000000..a82d98d88b30 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp @@ -0,0 +1,151 @@ +//- X86Insertwait.cpp - Strict-Fp:Insert wait instruction X87 instructions --// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass which insert x86 wait instructions after each +// X87 instructions when strict float is enabled. +// +// The logic to insert a wait instruction after an X87 instruction is as below: +// 1. If the X87 instruction don't raise float exception nor is a load/store +// instruction, or is a x87 control instruction, don't insert wait. +// 2. If the X87 instruction is an instruction which the following instruction +// is an X87 exception synchronizing X87 instruction, don't insert wait. +// 3. For other situations, insert wait instruction. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-insert-wait" + +namespace { + +class WaitInsert : public MachineFunctionPass { +public: + static char ID; + + WaitInsert() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "X86 insert wait instruction"; + } + +private: + const TargetInstrInfo *TII; // Machine instruction info. +}; + +} // namespace + +char WaitInsert::ID = 0; + +FunctionPass *llvm::createX86InsertX87waitPass() { return new WaitInsert(); } + +/// Return true if the Reg is X87 register. 
+static bool isX87Reg(unsigned Reg) { + return (Reg == X86::FPCW || Reg == X86::FPSW || + (Reg >= X86::ST0 && Reg <= X86::ST7)); +} + +/// check if the instruction is X87 instruction +static bool isX87Instruction(MachineInstr &MI) { + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg()) + continue; + if (isX87Reg(MO.getReg())) + return true; + } + return false; +} + +static bool isX87ControlInstruction(MachineInstr &MI) { + switch (MI.getOpcode()) { + case X86::FNINIT: + case X86::FLDCW16m: + case X86::FNSTCW16m: + case X86::FNSTSW16r: + case X86::FNSTSWm: + case X86::FNCLEX: + case X86::FLDENVm: + case X86::FSTENVm: + case X86::FRSTORm: + case X86::FSAVEm: + case X86::FINCSTP: + case X86::FDECSTP: + case X86::FFREE: + case X86::FFREEP: + case X86::FNOP: + case X86::WAIT: + return true; + default: + return false; + } +} + +static bool isX87NonWaitingControlInstruction(MachineInstr &MI) { + // a few special control instructions don't perform a wait operation + switch (MI.getOpcode()) { + case X86::FNINIT: + case X86::FNSTSW16r: + case X86::FNSTSWm: + case X86::FNSTCW16m: + case X86::FNCLEX: + return true; + default: + return false; + } +} + +bool WaitInsert::runOnMachineFunction(MachineFunction &MF) { + if (!MF.getFunction().hasFnAttribute(Attribute::StrictFP)) + return false; + + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); + TII = ST.getInstrInfo(); + bool Changed = false; + + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) { + // Jump non X87 instruction. + if (!isX87Instruction(*MI)) + continue; + // If the instruction instruction neither has float exception nor is + // a load/store instruction, or the instruction is x87 control + // instruction, do not insert wait. + if (!(MI->mayRaiseFPException() || MI->mayLoadOrStore()) || + isX87ControlInstruction(*MI)) + continue; + // If the following instruction is an X87 instruction and isn't an X87 + // non-waiting control instruction, we can omit insert wait instruction. + MachineBasicBlock::iterator AfterMI = std::next(MI); + if (AfterMI != MBB.end() && isX87Instruction(*AfterMI) && + !isX87NonWaitingControlInstruction(*AfterMI)) + continue; + + BuildMI(MBB, AfterMI, MI->getDebugLoc(), TII->get(X86::WAIT)); + LLVM_DEBUG(dbgs() << "\nInsert wait after:\t" << *MI); + // Jump the newly inserting wait + ++MI; + Changed = true; + } + } + return Changed; +} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td new file mode 100644 index 000000000000..e26dd5050a23 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td @@ -0,0 +1,119 @@ +//===---- X86InstrAMX.td - AMX Instruction Set Extension --*- tablegen -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel AMX instruction +// set. 
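Returning to the X86InsertWait pass above: the conditions in its inner loop map one-to-one onto the three rules listed in the file header. The helper below is written purely for illustration (it is not part of the patch) and reuses the isX87* predicates defined in that file to make the mapping explicit.

// Illustration only: the per-instruction decision the pass makes, phrased as
// a predicate over the rules in the file header. Meant to be read alongside
// X86InsertWait.cpp above, whose helpers it reuses.
static bool needsWaitAfter(MachineInstr &MI, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator Next) {
  if (!isX87Instruction(MI))
    return false;                 // only X87 instructions are considered
  if (!(MI.mayRaiseFPException() || MI.mayLoadOrStore()) ||
      isX87ControlInstruction(MI))
    return false;                 // rule 1: no FP exception/memory, or control
  if (Next != MBB.end() && isX87Instruction(*Next) &&
      !isX87NonWaitingControlInstruction(*Next))
    return false;                 // rule 2: the next X87 insn already syncs
  return true;                    // rule 3: otherwise insert a WAIT
}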
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMX instructions + +let Predicates = [HasAMXTILE, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), + "ldtilecfg\t$src", + [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS; + def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), + "sttilecfg\t$src", + [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD; + def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src), + "tileloadd\t{$src, $dst|$dst, $src}", []>, + VEX, T8XD; + def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src), + "tileloaddt1\t{$src, $dst|$dst, $src}", []>, + VEX, T8PD; + let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), + "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS; + def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), + (ins sibmem:$dst, TILE:$src), + "tilestored\t{$src, $dst|$dst, $src}", []>, + VEX, T8XS; + def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), + "tilezero\t$dst", []>, + VEX, T8XD; + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. + // To be translated to the actual instructions in X86ISelLowering.cpp + def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; + def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, + sibmem:$src2), []>; + def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; + def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), + [(int_x86_tilezero imm:$src)]>; + } + } // SchedRW +} // HasAMXTILE + +let Predicates = [HasAMXINT8, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, + VEX_4V, T8XD; + def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, + VEX_4V, T8XS; + def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, + VEX_4V, T8PD; + def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, + VEX_4V, T8PS; + } + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. 
+ // To be translated to the actual instructions in X86ISelLowering.cpp + def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbssd imm:$src1, + imm:$src2, imm:$src3)]>; + def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbsud imm:$src1, + imm:$src2, imm:$src3)]>; + def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbusd imm:$src1, + imm:$src2, imm:$src3)]>; + def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbuud imm:$src1, + imm:$src2, imm:$src3)]>; + } + } +} // HasAMXTILE + +let Predicates = [HasAMXBF16, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in + def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + []>, VEX_4V, T8XS; + + let usesCustomInserter = 1 in { + // Pseudo instructions, using immediates instead of tile registers. + // To be translated to the actual instructions in X86ISelLowering.cpp + def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, + u8imm:$src2, u8imm:$src3), + [(int_x86_tdpbf16ps imm:$src1, + imm:$src2, imm:$src3)]>; + } + } +} // HasAMXTILE, HasAMXBF16 diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td index 32f012033fb0..a3ad0b1c8dd6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td @@ -76,11 +76,11 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName); - ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), - !cast<ComplexPattern>("sse_load_f32"), - !if (!eq (EltTypeName, "f64"), - !cast<ComplexPattern>("sse_load_f64"), - ?)); + PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"), + !cast<PatFrags>("sse_load_f32"), + !if (!eq (EltTypeName, "f64"), + !cast<PatFrags>("sse_load_f64"), + ?)); // The string to specify embedded broadcast in assembly. string BroadcastStr = "{1to" # NumElts # "}"; @@ -169,6 +169,18 @@ def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>; def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>; def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>; +// Used for matching masked operations. Ensures the operation part only has a +// single use. +def vselect_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2), + (vselect node:$mask, node:$src1, node:$src2), [{ + return isProfitableToFormMaskedOp(N); +}]>; + +def X86selects_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2), + (X86selects node:$mask, node:$src1, node:$src2), [{ + return isProfitableToFormMaskedOp(N); +}]>; + // This multiclass generates the masking variants from the non-masking // variant. It only provides the assembly pieces for the masking variants. 
// It assumes custom ISel patterns for masking which can be provided as @@ -220,7 +232,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, - SDNode Select = vselect, + SDPatternOperator Select = vselect_mask, string MaskingConstraint = "", bit IsCommutable = 0, bit IsKCommutable = 0, @@ -236,35 +248,36 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the -// perserved vector elements come from a new dummy input operand tied to $dst. +// preserved vector elements come from a new dummy input operand tied to $dst. // This version uses a separate dag for non-masking and masking. multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskRHS, bit IsCommutable = 0, bit IsKCommutable = 0, - SDNode Select = vselect> : + bit IsKZCommutable = IsCommutable> : AVX512_maskable_custom<O, F, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.RC:$dst, RHS)], [(set _.RC:$dst, - (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))], + (vselect_mask _.KRCWM:$mask, MaskRHS, _.RC:$src0))], [(set _.RC:$dst, - (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))], - "$src0 = $dst", IsCommutable, IsKCommutable>; + (vselect_mask _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))], + "$src0 = $dst", IsCommutable, IsKCommutable, + IsKZCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the -// perserved vector elements come from a new dummy input operand tied to $dst. +// preserved vector elements come from a new dummy input operand tied to $dst. 
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, bit IsCommutable = 0, bit IsKCommutable = 0, bit IsKZCommutable = IsCommutable, - SDNode Select = vselect> : + SDPatternOperator Select = vselect_mask> : AVX512_maskable_common<O, F, _, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), @@ -280,7 +293,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, string AttSrcAsm, string IntelSrcAsm, dag RHS> : AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm, - RHS, 0, 0, 0, X86selects>; + RHS, 0, 0, 0, X86selects_mask>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -292,7 +305,7 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, dag RHS, bit IsCommutable = 0, bit IsKCommutable = 0, - SDNode Select = vselect, + SDPatternOperator Select = vselect_mask, bit MaskOnly = 0> : AVX512_maskable_common<O, F, _, Outs, !con((ins _.RC:$src1), NonTiedIns), @@ -317,9 +330,9 @@ multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag), - (vselect InVT.KRCWM:$mask, RHS, + (vselect_mask InVT.KRCWM:$mask, RHS, (bitconvert InVT.RC:$src1)), - vselect, "", IsCommutable>; + vselect_mask, "", IsCommutable>; multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, @@ -330,7 +343,7 @@ multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, bit MaskOnly = 0> : AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, IsCommutable, IsKCommutable, - X86selects, MaskOnly>; + X86selects_mask, MaskOnly>; multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, @@ -399,6 +412,36 @@ multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (and _.KRCWM:$mask, RHS_su), IsCommutable>; +// Used by conversion instructions. +multiclass AVX512_maskable_cvt<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, dag ZeroMaskingRHS> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, MaskingRHS)], + [(set _.RC:$dst, ZeroMaskingRHS)], + "$src0 = $dst">; + +multiclass AVX512_maskable_fma<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, bit IsCommutable, + bit IsKCommutable> : + AVX512_maskable_custom<O, F, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, + (vselect_mask _.KRCWM:$mask, MaskingRHS, _.RC:$src1))], + [(set _.RC:$dst, + (vselect_mask _.KRCWM:$mask, MaskingRHS, _.ImmAllZerosV))], + "", IsCommutable, IsKCommutable>; // Alias instruction that maps zero vector to pxor / xorp* for AVX-512. 
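For readers less familiar with the three flavors these AVX512_maskable* multiclasses expand to, the distinction is easiest to see from the user-facing intrinsics: an unconditional form, a merge-masking form whose masked-off lanes keep a passthrough value, and a zero-masking form whose masked-off lanes become zero. The snippet below uses plain AVX-512F intrinsics and is not part of the patch.

// Illustration only (requires AVX-512F): the unmasked, merge-masked and
// zero-masked variants, shown for a packed float add.
#include <immintrin.h>

__m512 add_three_ways(__m512 passthru, __mmask16 k, __m512 a, __m512 b) {
  __m512 plain = _mm512_add_ps(a, b);                   // no mask
  __m512 merge = _mm512_mask_add_ps(passthru, k, a, b); // k=0 lanes keep passthru
  __m512 zero  = _mm512_maskz_add_ps(k, a, b);          // k=0 lanes become 0.0
  return _mm512_add_ps(plain, _mm512_add_ps(merge, zero));
}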
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then @@ -625,45 +668,45 @@ multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From, list<Predicate> p> { let Predicates = p in { def : Pat<(Cast.VT - (vselect Cast.KRCWM:$mask, - (bitconvert - (vinsert_insert:$ins (To.VT To.RC:$src1), - (From.VT From.RC:$src2), - (iPTR imm))), - Cast.RC:$src0)), + (vselect_mask Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))), + Cast.RC:$src0)), (!cast<Instruction>(InstrStr#"rrk") Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2, (INSERT_get_vinsert_imm To.RC:$ins))>; def : Pat<(Cast.VT - (vselect Cast.KRCWM:$mask, - (bitconvert - (vinsert_insert:$ins (To.VT To.RC:$src1), - (From.VT - (bitconvert - (From.LdFrag addr:$src2))), - (iPTR imm))), - Cast.RC:$src0)), + (vselect_mask Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT + (bitconvert + (From.LdFrag addr:$src2))), + (iPTR imm))), + Cast.RC:$src0)), (!cast<Instruction>(InstrStr#"rmk") Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2, (INSERT_get_vinsert_imm To.RC:$ins))>; def : Pat<(Cast.VT - (vselect Cast.KRCWM:$mask, - (bitconvert - (vinsert_insert:$ins (To.VT To.RC:$src1), - (From.VT From.RC:$src2), - (iPTR imm))), - Cast.ImmAllZerosV)), + (vselect_mask Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))), + Cast.ImmAllZerosV)), (!cast<Instruction>(InstrStr#"rrkz") Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2, (INSERT_get_vinsert_imm To.RC:$ins))>; def : Pat<(Cast.VT - (vselect Cast.KRCWM:$mask, - (bitconvert - (vinsert_insert:$ins (To.VT To.RC:$src1), - (From.VT (From.LdFrag addr:$src2)), - (iPTR imm))), - Cast.ImmAllZerosV)), + (vselect_mask Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT (From.LdFrag addr:$src2)), + (iPTR imm))), + Cast.ImmAllZerosV)), (!cast<Instruction>(InstrStr#"rmkz") Cast.KRCWM:$mask, To.RC:$src1, addr:$src2, (INSERT_get_vinsert_imm To.RC:$ins))>; @@ -981,20 +1024,20 @@ multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From, SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> { let Predicates = p in { - def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, - (bitconvert - (To.VT (vextract_extract:$ext - (From.VT From.RC:$src), (iPTR imm)))), - To.RC:$src0)), + def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask, + (bitconvert + (To.VT (vextract_extract:$ext + (From.VT From.RC:$src), (iPTR imm)))), + To.RC:$src0)), (Cast.VT (!cast<Instruction>(InstrStr#"rrk") Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src, (EXTRACT_get_vextract_imm To.RC:$ext)))>; - def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, - (bitconvert - (To.VT (vextract_extract:$ext - (From.VT From.RC:$src), (iPTR imm)))), - Cast.ImmAllZerosV)), + def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask, + (bitconvert + (To.VT (vextract_extract:$ext + (From.VT From.RC:$src), (iPTR imm)))), + Cast.ImmAllZerosV)), (Cast.VT (!cast<Instruction>(InstrStr#"rrkz") Cast.KRCWM:$mask, From.RC:$src, (EXTRACT_get_vextract_imm To.RC:$ext)))>; @@ -1101,18 +1144,18 @@ multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr, string Name, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)), - (!cast<Instruction>(Name#DestInfo.ZSuffix#r) + (!cast<Instruction>(Name#DestInfo.ZSuffix#rr) (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; - def : 
Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast SrcInfo.FRC:$src), - DestInfo.RC:$src0)), - (!cast<Instruction>(Name#DestInfo.ZSuffix#rk) + def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask, + (X86VBroadcast SrcInfo.FRC:$src), + DestInfo.RC:$src0)), + (!cast<Instruction>(Name#DestInfo.ZSuffix#rrk) DestInfo.RC:$src0, DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast SrcInfo.FRC:$src), - DestInfo.ImmAllZerosV)), - (!cast<Instruction>(Name#DestInfo.ZSuffix#rkz) + def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask, + (X86VBroadcast SrcInfo.FRC:$src), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(Name#DestInfo.ZSuffix#rrkz) DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; } @@ -1128,83 +1171,83 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, SDPatternOperator UnmaskedOp = X86VBroadcast, SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> { let hasSideEffects = 0 in - def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set MaskInfo.RC:$dst, - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))], - DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>; - def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), - (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), - [(set MaskInfo.RC:$dst, - (vselect MaskInfo.KRCWM:$mask, - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), - MaskInfo.ImmAllZerosV))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; - let Constraints = "$src0 = $dst" in - def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), - (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, - SrcInfo.RC:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", - "${dst} {${mask}}, $src}"), + def rr : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set MaskInfo.RC:$dst, - (vselect MaskInfo.KRCWM:$mask, - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), - MaskInfo.RC:$src0))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>; + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))], + DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), + (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect_mask MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; + let Constraints = "$src0 = $dst" in + def rrk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), + (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, + SrcInfo.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", + "${dst} {${mask}}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect_mask MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + MaskInfo.RC:$src0))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, 
Sched<[SchedRR]>; let hasSideEffects = 0, mayLoad = 1 in - def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), - (ins SrcInfo.ScalarMemOp:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set MaskInfo.RC:$dst, - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (UnmaskedBcastOp addr:$src)))))], - DestInfo.ExeDomain>, T8PD, EVEX, - EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; - - def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), - (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", - "${dst} {${mask}} {z}, $src}"), - [(set MaskInfo.RC:$dst, - (vselect MaskInfo.KRCWM:$mask, - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (SrcInfo.BroadcastLdFrag addr:$src)))), - MaskInfo.ImmAllZerosV))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, - EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set MaskInfo.RC:$dst, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedBcastOp addr:$src)))))], + DestInfo.ExeDomain>, T8PD, EVEX, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + + def rmkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect_mask MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.ImmAllZerosV))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), - (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, - SrcInfo.ScalarMemOp:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", - "${dst} {${mask}}, $src}"), - [(set MaskInfo.RC:$dst, - (vselect MaskInfo.KRCWM:$mask, - (MaskInfo.VT - (bitconvert - (DestInfo.VT - (SrcInfo.BroadcastLdFrag addr:$src)))), - MaskInfo.RC:$src0))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, - EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; + def rmk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), + (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, + SrcInfo.ScalarMemOp:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|", + "${dst} {${mask}}, $src}"), + [(set MaskInfo.RC:$dst, + (vselect_mask MaskInfo.KRCWM:$mask, + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (SrcInfo.BroadcastLdFrag addr:$src)))), + MaskInfo.RC:$src0))], + DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, + EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; } // Helper class to force mask and broadcast result to same type. @@ -1267,35 +1310,38 @@ defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd", multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR, X86VectorVTInfo _, SDPatternOperator OpNode, RegisterClass SrcRC> { + // Fold with a mask even if it has multiple uses since it is cheap. 
let ExeDomain = _.ExeDomain in - defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins SrcRC:$src), - "vpbroadcast"##_.Suffix, "$src", "$src", - (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX, - Sched<[SchedRR]>; + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), + "vpbroadcast"#_.Suffix, "$src", "$src", + (_.VT (OpNode SrcRC:$src)), /*IsCommutable*/0, + /*IsKCommutable*/0, /*IsKZCommutable*/0, vselect>, + T8PD, EVEX, Sched<[SchedRR]>; } multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR, X86VectorVTInfo _, SDPatternOperator OpNode, RegisterClass SrcRC, SubRegIndex Subreg> { let hasSideEffects = 0, ExeDomain = _.ExeDomain in - defm r : AVX512_maskable_custom<opc, MRMSrcReg, - (outs _.RC:$dst), (ins GR32:$src), - !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)), - !con((ins _.KRCWM:$mask), (ins GR32:$src)), - "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [], - "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>; + defm rr : AVX512_maskable_custom<opc, MRMSrcReg, + (outs _.RC:$dst), (ins GR32:$src), + !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)), + !con((ins _.KRCWM:$mask), (ins GR32:$src)), + "vpbroadcast"#_.Suffix, "$src", "$src", [], [], [], + "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>; def : Pat <(_.VT (OpNode SrcRC:$src)), - (!cast<Instruction>(Name#r) + (!cast<Instruction>(Name#rr) (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>; + // Fold with a mask even if it has multiple uses since it is cheap. def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0), - (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask, + (!cast<Instruction>(Name#rrk) _.RC:$src0, _.KRCWM:$mask, (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>; def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV), - (!cast<Instruction>(Name#rkz) _.KRCWM:$mask, + (!cast<Instruction>(Name#rrkz) _.KRCWM:$mask, (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>; } @@ -1392,72 +1438,6 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, AVX5128IBase, EVEX; } -let Predicates = [HasAVX512] in { - // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), - (VPBROADCASTQZm addr:$src)>; - - // FIXME this is to handle aligned extloads from i8. - def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDZm addr:$src)>; -} - -let Predicates = [HasVLX] in { - // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), - (VPBROADCASTQZ128m addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), - (VPBROADCASTQZ256m addr:$src)>; - - // FIXME this is to handle aligned extloads from i8. - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDZ128m addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDZ256m addr:$src)>; -} -let Predicates = [HasVLX, HasBWI] in { - // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. - // This means we'll encounter truncated i32 loads; match that here. 
- def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), - (VPBROADCASTWZ128m addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), - (VPBROADCASTWZ256m addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast - (i16 (trunc (i32 (extloadi16 addr:$src)))))), - (VPBROADCASTWZ128m addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast - (i16 (trunc (i32 (zextloadi16 addr:$src)))))), - (VPBROADCASTWZ128m addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast - (i16 (trunc (i32 (extloadi16 addr:$src)))))), - (VPBROADCASTWZ256m addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast - (i16 (trunc (i32 (zextloadi16 addr:$src)))))), - (VPBROADCASTWZ256m addr:$src)>; - - // FIXME this is to handle aligned extloads from i8. - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWZ128m addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWZ256m addr:$src)>; -} -let Predicates = [HasBWI] in { - // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. - // This means we'll encounter truncated i32 loads; match that here. - def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), - (VPBROADCASTWZm addr:$src)>; - def : Pat<(v32i16 (X86VBroadcast - (i16 (trunc (i32 (extloadi16 addr:$src)))))), - (VPBROADCASTWZm addr:$src)>; - def : Pat<(v32i16 (X86VBroadcast - (i16 (trunc (i32 (zextloadi16 addr:$src)))))), - (VPBROADCASTWZm addr:$src)>; - - // FIXME this is to handle aligned extloads from i8. - def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWZm addr:$src)>; -} - //===----------------------------------------------------------------------===// // AVX-512 BROADCAST SUBVECTORS // @@ -1516,38 +1496,38 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTI32X4rm addr:$src)>; // Patterns for selects of bitcasted operations. 
-def : Pat<(vselect VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - (v16f32 immAllZerosV)), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (v16f32 immAllZerosV)), (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + VR512:$src0), (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), - (v16i32 immAllZerosV)), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (v16i32 immAllZerosV)), (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + VR512:$src0), (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), - (v8f64 immAllZerosV)), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), + (v8f64 immAllZerosV)), (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), + VR512:$src0), (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), - (v8i64 immAllZerosV)), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), + (v8i64 immAllZerosV)), (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))), + VR512:$src0), (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } @@ -1569,21 +1549,21 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), (VBROADCASTI32X4Z256rm addr:$src)>; // Patterns for selects of bitcasted operations. 
-def : Pat<(vselect VK8WM:$mask, - (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - (v8f32 immAllZerosV)), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (v8f32 immAllZerosV)), (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), - VR256X:$src0), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + VR256X:$src0), (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), - (v8i32 immAllZerosV)), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (v8i32 immAllZerosV)), (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), - VR256X:$src0), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + VR256X:$src0), (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; @@ -1618,21 +1598,21 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2" EVEX_V256, EVEX_CD8<64, CD8VT2>; // Patterns for selects of bitcasted operations. -def : Pat<(vselect VK4WM:$mask, - (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - (v4f64 immAllZerosV)), +def : Pat<(vselect_mask VK4WM:$mask, + (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (v4f64 immAllZerosV)), (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; -def : Pat<(vselect VK4WM:$mask, - (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - VR256X:$src0), +def : Pat<(vselect_mask VK4WM:$mask, + (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + VR256X:$src0), (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; -def : Pat<(vselect VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - (v4i64 immAllZerosV)), +def : Pat<(vselect_mask VK4WM:$mask, + (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + (v4i64 immAllZerosV)), (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; -def : Pat<(vselect VK4WM:$mask, - (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - VR256X:$src0), +def : Pat<(vselect_mask VK4WM:$mask, + (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + VR256X:$src0), (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; } @@ -1651,38 +1631,38 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", EVEX_V512, EVEX_CD8<32, CD8VT8>; // Patterns for selects of bitcasted operations. 
-def : Pat<(vselect VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), - (v16f32 immAllZerosV)), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), + (v16f32 immAllZerosV)), (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK16WM:$mask, - (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), + VR512:$src0), (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), - (v16i32 immAllZerosV)), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), + (v16i32 immAllZerosV)), (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK16WM:$mask, - (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), + VR512:$src0), (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - (v8f64 immAllZerosV)), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (v8f64 immAllZerosV)), (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + VR512:$src0), (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - (v8i64 immAllZerosV)), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + (v8i64 immAllZerosV)), (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; -def : Pat<(vselect VK8WM:$mask, - (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), - VR512:$src0), +def : Pat<(vselect_mask VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))), + VR512:$src0), (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; } @@ -1836,24 +1816,27 @@ defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256, multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _, X86VectorVTInfo IdxVT, X86VectorVTInfo CastVT> { - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (X86VPermt2 (_.VT _.RC:$src2), - (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), _.RC:$src3), - (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, + (X86VPermt2 (_.VT _.RC:$src2), + (IdxVT.VT (bitconvert + (CastVT.VT _.RC:$src1))), + _.RC:$src3), + (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), (!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, _.RC:$src3)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (X86VPermt2 _.RC:$src2, - (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), - (_.LdFrag addr:$src3)), - (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, + (X86VPermt2 _.RC:$src2, + (IdxVT.VT (bitconvert + (CastVT.VT _.RC:$src1))), + (_.LdFrag addr:$src3)), + (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), (!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; - def : Pat<(_.VT 
(vselect _.KRCWM:$mask, - (X86VPermt2 _.RC:$src2, - (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), - (_.BroadcastLdFrag addr:$src3)), - (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, + (X86VPermt2 _.RC:$src2, + (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), + (_.BroadcastLdFrag addr:$src3)), + (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } @@ -2085,9 +2068,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, (ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, $src2, $src1", "$src1, $src2, $cc", - (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), timm:$cc), - (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; @@ -2646,13 +2629,13 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1), (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix# + OpcodeStr#_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclasss_su (_.VT _.RC:$src1), @@ -2660,18 +2643,18 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.IntScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix## + OpcodeStr#_.Suffix# "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, - (X86Vfpclasss _.ScalarIntMemCPat:$src1, - (i32 timm:$src2)))]>, + (X86Vfpclasss (_.ScalarIntMemFrags addr:$src1), + (i32 timm:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix## + OpcodeStr#_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, - (X86Vfpclasss_su _.ScalarIntMemCPat:$src1, + (X86Vfpclasss_su (_.ScalarIntMemFrags addr:$src1), (i32 timm:$src2))))]>, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2686,13 +2669,13 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1), (i32 timm:$src2)))]>, Sched<[sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix# + OpcodeStr#_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su (_.VT _.RC:$src1), @@ -2700,7 +2683,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string 
OpcodeStr, EVEX_K, Sched<[sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"{"#mem#"}"# + OpcodeStr#_.Suffix#"{"#mem#"}"# "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT (_.LdFrag addr:$src1)), @@ -2708,7 +2691,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix#"{"#mem#"}"# + OpcodeStr#_.Suffix#"{"#mem#"}"# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (_.LdFrag addr:$src1)), @@ -2716,18 +2699,18 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix##"\t{$src2, ${src1}"## - _.BroadcastStr##", $dst|$dst, ${src1}" - ##_.BroadcastStr##", $src2}", + OpcodeStr#_.Suffix#"\t{$src2, ${src1}"# + _.BroadcastStr#", $dst|$dst, ${src1}" + #_.BroadcastStr#", $src2}", [(set _.KRC:$dst,(X86Vfpclass (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix##"\t{$src2, ${src1}"## - _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## - _.BroadcastStr##", $src2}", + OpcodeStr#_.Suffix#"\t{$src2, ${src1}"# + _.BroadcastStr#", $dst {${mask}}|$dst {${mask}}, ${src1}"# + _.BroadcastStr#", $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2))))]>, @@ -2979,6 +2962,8 @@ def : Pat<(vnot VK4:$src), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>; def : Pat<(vnot VK2:$src), (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>; +def : Pat<(vnot VK1:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK1:$src, VK16)), VK2)>; // Mask binary operation // - KAND, KANDN, KOR, KXNOR, KXOR @@ -3008,8 +2993,6 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; } -def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; -def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; // These nodes use 'vnot' instead of 'not' to support vectors. 
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>; def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>; @@ -3022,7 +3005,7 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XM defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>; defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>; -multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode, +multiclass avx512_binop_pat<SDPatternOperator VOpNode, Instruction Inst> { // With AVX512F, 8-bit mask is promoted to 16-bit mask, // for the DQI set, this type is legal and KxxxB instruction is used @@ -3033,25 +3016,25 @@ multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode, (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; // All types smaller than 8 bits require conversion anyway - def : Pat<(OpNode VK1:$src1, VK1:$src2), + def : Pat<(VOpNode VK1:$src1, VK1:$src2), (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK1:$src1, VK16), (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; def : Pat<(VOpNode VK2:$src1, VK2:$src2), (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK2:$src1, VK16), - (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>; + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>; def : Pat<(VOpNode VK4:$src1, VK4:$src2), (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS VK4:$src1, VK16), - (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>; + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>; } -defm : avx512_binop_pat<and, and, KANDWrr>; -defm : avx512_binop_pat<vandn, andn, KANDNWrr>; -defm : avx512_binop_pat<or, or, KORWrr>; -defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>; -defm : avx512_binop_pat<xor, xor, KXORWrr>; +defm : avx512_binop_pat<and, KANDWrr>; +defm : avx512_binop_pat<vandn, KANDNWrr>; +defm : avx512_binop_pat<or, KORWrr>; +defm : avx512_binop_pat<vxnor, KXNORWrr>; +defm : avx512_binop_pat<xor, KXORWrr>; // Mask unpacking multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst, @@ -3065,7 +3048,7 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst, VEX_4V, VEX_L, Sched<[sched]>; def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)), - (!cast<Instruction>(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>; + (!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>; } } @@ -3201,8 +3184,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr, X86VectorVTInfo Narrow, X86VectorVTInfo Wide> { -def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2), timm:$cc)), +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), timm:$cc)), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrri") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3219,8 +3202,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, timm:$cc), Narrow.KRC)>; // Broadcast load. -def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1), - (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1), + (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbi") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3235,8 +3218,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, addr:$src2, timm:$cc), Narrow.KRC)>; // Commuted with broadcast load. 
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), - (Narrow.VT Narrow.RC:$src1), timm:$cc)), +def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), + (Narrow.VT Narrow.RC:$src1), timm:$cc)), (COPY_TO_REGCLASS (!cast<Instruction>(InstStr#"Zrmbi") (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), @@ -3301,7 +3284,7 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { let Predicates = [HasAVX512] in let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1, SchedRW = [WriteZero] in - def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", + def NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", [(set KRC:$dst, (VT Val))]>; } @@ -3409,7 +3392,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name, !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", "${dst} {${mask}}, $src1}"), [(set _.RC:$dst, (_.VT - (vselect _.KRCWM:$mask, + (vselect_mask _.KRCWM:$mask, (_.VT (ld_frag addr:$src1)), (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.RM]>; @@ -3418,18 +3401,18 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name, (ins _.KRCWM:$mask, _.MemOp:$src), OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# "${dst} {${mask}} {z}, $src}", - [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + [(set _.RC:$dst, (_.VT (vselect_mask _.KRCWM:$mask, (_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))], _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>; } def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), - (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>; def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), - (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>; def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), - (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0, + (!cast<Instruction>(Name#_.ZSuffix#rmk) _.RC:$src0, _.KRCWM:$mask, addr:$ptr)>; } @@ -4286,6 +4269,17 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0))) def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)), (COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>; + +def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 VR128X:$src2))), + (VMOVSSZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>; +def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 VR128X:$src2))), + (VMOVSDZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>; + +def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 immAllZerosV))), + (VMOVSSZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>; +def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 immAllZerosV))), + (VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), @@ -4439,8 +4433,6 @@ let Predicates = [HasAVX512] in { (VMOV64toPQIZrr GR64:$src)>; // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), - (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v4i32 (X86vzload32 addr:$src)), (VMOVDI2PDIZrm addr:$src)>; def : Pat<(v8i32 (X86vzload32 addr:$src)), @@ -4624,8 +4616,8 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> { defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + "${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, EVEX_B, @@ -4750,8 +4742,8 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2), OpcodeStr, - "${src2}"##_Brdct.BroadcastStr##", $src1", - "$src1, ${src2}"##_Brdct.BroadcastStr, + "${src2}"#_Brdct.BroadcastStr#", $src1", + "$src1, ${src2}"#_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, EVEX_B, @@ -4822,8 +4814,8 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, - "${src2}"##_Src.BroadcastStr##", $src1", - "$src1, ${src2}"##_Src.BroadcastStr, + "${src2}"#_Src.BroadcastStr#", $src1", + "$src1, ${src2}"#_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, @@ -5159,26 +5151,26 @@ multiclass avx512_logical_lowering<string InstrStr, SDNode OpNode, X86VectorVTInfo _, X86VectorVTInfo IntInfo> { // Masked register-register logical operations. - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))), _.RC:$src0)), (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))), _.ImmAllZerosV)), (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1, _.RC:$src2)>; // Masked register-memory logical operations. - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (load addr:$src2)))), _.RC:$src0)), (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (load addr:$src2)))), _.ImmAllZerosV)), @@ -5190,14 +5182,14 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode, X86VectorVTInfo _, X86VectorVTInfo IntInfo> { // Register-broadcast logical operations. 
- def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.RC:$src0)), (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), @@ -5304,7 +5296,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, - _.ScalarIntMemCPat:$src2))>, + (_.ScalarIntMemFrags addr:$src2)))>, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), @@ -5350,7 +5342,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, - _.ScalarIntMemCPat:$src2))>, + (_.ScalarIntMemFrags addr:$src2)))>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let isCodeGenOnly = 1, Predicates = [HasAVX512], @@ -5463,28 +5455,32 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, EVEX_CD8<64, CD8VT1>, SIMD_EXC; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, bit IsCommutable, bit IsKCommutable = IsCommutable> { let ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + defm rr: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), IsCommutable, IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>; let mayLoad = 1 in { - defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, + (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), + (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, + defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix, + "${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))), + (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5496,7 +5492,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, Uses = 
[MXCSR] in defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#_.Suffix, "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; @@ -5507,38 +5503,39 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>, EVEX_4V, EVEX_B, Sched<[sched]>; } multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, + SDPatternOperator MaskOpNode, Predicate prd, X86SchedWriteSizes sched, bit IsCommutable = 0, bit IsPD128Commutable = IsCommutable> { let Predicates = [prd] in { - defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, + defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info, sched.PS.ZMM, IsCommutable>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, + defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info, sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W, EVEX_CD8<64, CD8VF>; } // Define only if AVX512VL feature is present. let Predicates = [prd, HasVLX] in { - defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, + defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info, sched.PS.XMM, IsCommutable>, EVEX_V128, PS, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, + defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info, sched.PS.YMM, IsCommutable>, EVEX_V256, PS, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, + defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info, sched.PD.XMM, IsPD128Commutable, IsCommutable>, EVEX_V128, PD, VEX_W, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, + defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f64x_info, sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -5566,38 +5563,38 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } -defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, HasAVX512, +defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512, SchedWriteFAddSizes, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>; -defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, HasAVX512, +defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512, SchedWriteFMulSizes, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, HasAVX512, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512, SchedWriteFAddSizes>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>; -defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, HasAVX512, +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512, SchedWriteFDivSizes>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, 
SchedWriteFDivSizes>; -defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512, SchedWriteFCmpSizes, 0>, avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>; -defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512, SchedWriteFCmpSizes, 0>, avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>; let isCodeGenOnly = 1 in { - defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, + defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512, SchedWriteFCmpSizes, 1>; - defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512, + defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512, SchedWriteFCmpSizes, 1>; } let Uses = []<Register>, mayRaiseFPException = 0 in { -defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI, +defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, null_frag, HasDQI, SchedWriteFLogicSizes, 1>; -defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI, +defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, null_frag, HasDQI, SchedWriteFLogicSizes, 0>; -defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, +defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, null_frag, HasDQI, SchedWriteFLogicSizes, 1>; -defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, +defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, null_frag, HasDQI, SchedWriteFLogicSizes, 1>; } @@ -5605,19 +5602,19 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, EVEX_4V, Sched<[sched]>; defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix, + "${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr, (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5627,14 +5624,14 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, Sched<[sched]>; defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", 
- (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>, + (OpNode _.RC:$src1, (_.ScalarIntMemFrags addr:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5648,11 +5645,11 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>, - avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, + avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info, X86scalefsRnd, sched.Scl>, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>, - avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, + avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info, X86scalefsRnd, sched.Scl>, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W; @@ -5679,7 +5676,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG. - // There are just too many permuations due to commutability and bitcasts. + // There are just too many permutations due to commutability and bitcasts. let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -5701,8 +5698,8 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + "${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr, (null_frag), (null_frag)>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -5790,7 +5787,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, let ExeDomain = _.ExeDomain in defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, - "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + "$src2, ${src1}"#_.BroadcastStr, "${src1}"#_.BroadcastStr#", $src2", (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>, EVEX_B, Sched<[sched.Folded]>; } @@ -5973,8 +5970,8 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, let ExeDomain = _.ExeDomain in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + "${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -6245,8 +6242,8 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, - "${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr, + "${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, @@ -6370,9 
+6367,6 @@ defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd, let Predicates = [HasAVX512] in { // VMOVHPD patterns - def : Pat<(v2f64 (X86Unpckl VR128X:$src1, - (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), - (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))), (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; @@ -6419,29 +6413,33 @@ let Predicates = [HasAVX512] in { // multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched, + SDNode MaskOpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), + (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, AVX512FMA3Base, Sched<[sched]>; - defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, + (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), + (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; - defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, + _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), + (MaskOpNode _.RC:$src2, _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, - AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6450,74 +6448,88 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR] in - defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _, string Suff> { let Predicates = [HasAVX512] in { - defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.ZMM, - _.info512, Suff>, + defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.ZMM, _.info512, Suff>, avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512, Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasAVX512] in { - 
defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.YMM, - _.info256, Suff>, + defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.YMM, _.info256, Suff>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.XMM, - _.info128, Suff>, + defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.XMM, _.info128, Suff>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd> { - defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, - SchedWriteFMA, avx512vl_f32_info, "PS">; - defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, - SchedWriteFMA, avx512vl_f64_info, "PD">, - VEX_W; -} - -defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; -defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; -defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; -defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; + SDNode MaskOpNode, SDNode OpNodeRnd> { + defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode, + OpNodeRnd, SchedWriteFMA, + avx512vl_f32_info, "PS">; + defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode, + OpNodeRnd, SchedWriteFMA, + avx512vl_f64_info, "PD">, VEX_W; +} + +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, + X86Fmadd, X86FmaddRnd>; +defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub, + X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, + X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, + X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd, + X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub, + X86Fnmsub, X86FnmsubRnd>; multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched, + SDNode MaskOpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1, - vselect, 1>, AVX512FMA3Base, Sched<[sched]>; + (null_frag), + (_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, + AVX512FMA3Base, Sched<[sched]>; - defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), + (_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; - defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs 
_.RC:$dst), + defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), - OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", - "$src2, ${src3}"##_.BroadcastStr, + OpcodeStr, "${src3}"#_.BroadcastStr#", $src2", + "$src2, ${src3}"#_.BroadcastStr, (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), - _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, + _.RC:$src1)), + (_.VT (MaskOpNode _.RC:$src2, + (_.VT (_.BroadcastLdFrag addr:$src3)), + _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6527,77 +6539,89 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR] in - defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))), - 1, 1, vselect, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; + (null_frag), + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))), + 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _, string Suff> { let Predicates = [HasAVX512] in { - defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.ZMM, - _.info512, Suff>, + defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.ZMM, _.info512, Suff>, avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512, Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasAVX512] in { - defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.YMM, - _.info256, Suff>, + defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.YMM, _.info256, Suff>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.XMM, - _.info128, Suff>, + defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.XMM, _.info128, Suff>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd > { - defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, - SchedWriteFMA, avx512vl_f32_info, "PS">; - defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, - SchedWriteFMA, avx512vl_f64_info, "PD">, - VEX_W; -} - -defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; -defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; -defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; -defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; + SDNode MaskOpNode, SDNode OpNodeRnd > { + defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode, + OpNodeRnd, SchedWriteFMA, + avx512vl_f32_info, "PS">; + defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, 
MaskOpNode, + OpNodeRnd, SchedWriteFMA, + avx512vl_f64_info, "PD">, VEX_W; +} + +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, + X86Fmadd, X86FmaddRnd>; +defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub, + X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, + X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, + X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd, + X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub, + X86Fnmsub, X86FnmsubRnd>; multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched, + SDNode MaskOpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>, + (null_frag), + (_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>, AVX512FMA3Base, Sched<[sched]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. - defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, + (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), + (_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), - OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", - "$src2, ${src3}"##_.BroadcastStr, + OpcodeStr, "${src3}"#_.BroadcastStr#", $src2", + "$src2, ${src3}"#_.BroadcastStr, (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2)), 1, 0>, + _.RC:$src1, _.RC:$src2)), + (_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)), + _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6607,49 +6631,57 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, Uses = [MXCSR] in - defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))), - 1, 1, vselect, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; + (null_frag), + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))), + 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched, + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _, string Suff> { let Predicates = [HasAVX512] in { - defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.ZMM, - _.info512, Suff>, + defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.ZMM, _.info512, Suff>, avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512, Suff>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasAVX512] in { - defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.YMM, - _.info256, Suff>, + defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.YMM, _.info256, Suff>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.XMM, - _.info128, Suff>, + defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.XMM, _.info128, Suff>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd > { - defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, - SchedWriteFMA, avx512vl_f32_info, "PS">; - defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, - SchedWriteFMA, avx512vl_f64_info, "PD">, - VEX_W; -} - -defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; -defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; -defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; -defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>; + SDNode MaskOpNode, SDNode OpNodeRnd > { + defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode, + OpNodeRnd, SchedWriteFMA, + avx512vl_f32_info, "PS">; + defm PD : 
avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode, + OpNodeRnd, SchedWriteFMA, + avx512vl_f64_info, "PD">, VEX_W; +} + +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, + X86Fmadd, X86FmaddRnd>; +defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub, + X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, + X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, + X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd, + X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub, + X86Fnmsub, X86FnmsubRnd>; // Scalar FMA multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -6742,11 +6774,12 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, } defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>; -defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; -defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; -defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>; -multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, +multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp, + SDNode RndOp, string Prefix, string Suffix, SDNode Move, X86VectorVTInfo _, PatLeaf ZeroFP> { let Predicates = [HasAVX512] in { @@ -6788,8 +6821,8 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), _.FRC:$src3), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), @@ -6799,8 +6832,8 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), (_.ScalarLdFrag addr:$src3)), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), @@ -6809,18 +6842,18 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (_.ScalarLdFrag addr:$src3), _.FRC:$src2), + (X86selects_mask VK1WM:$mask, + (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3), _.FRC:$src2), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast<I>(Prefix#"132"#Suffix#"Zm_Intk") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, _.FRC:$src3, - (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + 
(X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast<I>(Prefix#"231"#Suffix#"Zr_Intk") VR128X:$src1, VK1WM:$mask, @@ -6828,19 +6861,19 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), - (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), (!cast<I>(Prefix#"231"#Suffix#"Zm_Intk") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, - (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src3), + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz") VR128X:$src1, VK1WM:$mask, @@ -6848,9 +6881,9 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, _.FRC:$src3, - (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz") VR128X:$src1, VK1WM:$mask, @@ -6858,28 +6891,28 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, - (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (_.ScalarLdFrag addr:$src3)), + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3)), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src2, (_.ScalarLdFrag addr:$src3)), + (X86selects_mask VK1WM:$mask, + (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2, (_.ScalarLdFrag addr:$src3)), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz") VR128X:$src1, VK1WM:$mask, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, - (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), - (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (X86selects_mask VK1WM:$mask, + (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), (_.EltVT ZeroFP)))))), (!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz") VR128X:$src1, VK1WM:$mask, @@ -6903,7 +6936,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, 
string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, + (X86selects_mask VK1WM:$mask, (RndOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), _.FRC:$src3, (i32 timm:$rc)), @@ -6914,7 +6947,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, + (X86selects_mask VK1WM:$mask, (RndOp _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), (i32 timm:$rc)), @@ -6925,7 +6958,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, + (X86selects_mask VK1WM:$mask, (RndOp _.FRC:$src2, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), _.FRC:$src3, (i32 timm:$rc)), @@ -6936,7 +6969,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>; def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector - (X86selects VK1WM:$mask, + (X86selects_mask VK1WM:$mask, (RndOp _.FRC:$src2, _.FRC:$src3, (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), (i32 timm:$rc)), @@ -6948,23 +6981,23 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, } } -defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS", - X86Movss, v4f32x_info, fp32imm0>; -defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS", - X86Movss, v4f32x_info, fp32imm0>; -defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS", - X86Movss, v4f32x_info, fp32imm0>; -defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS", - X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD", + "SS", X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB", + "SS", X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD", + "SS", X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB", + "SS", X86Movss, v4f32x_info, fp32imm0>; -defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD", - X86Movsd, v2f64x_info, fp64imm0>; -defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD", - X86Movsd, v2f64x_info, fp64imm0>; -defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD", - X86Movsd, v2f64x_info, fp64imm0>; -defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD", - X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD", + "SD", X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB", + "SD", X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD", + "SD", X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB", + "SD", X86Movsd, v2f64x_info, fp64imm0>; 
//===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA @@ -7194,7 +7227,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode - (SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>, + (SrcVT.ScalarIntMemFrags addr:$src)))]>, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } // Predicates = [HasAVX512] @@ -7233,6 +7266,45 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2u X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT, + X86VectorVTInfo DstVT, SDNode OpNode, + X86FoldableSchedWrite sched, + string aliasStr> { + let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in { + let isCodeGenOnly = 1 in { + def rr : AVX512<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.FRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstVT.RC:$dst, (OpNode SrcVT.FRC:$src))]>, + EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC; + def rm : AVX512<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstVT.RC:$dst, (OpNode (SrcVT.ScalarLdFrag addr:$src)))]>, + EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + } + } // Predicates = [HasAVX512] +} + +defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info, + lrint, WriteCvtSS2I, + "{l}">, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info, + llrint, WriteCvtSS2I, + "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info, + lrint, WriteCvtSD2I, + "{l}">, XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info, + llrint, WriteCvtSD2I, + "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +let Predicates = [HasAVX512] in { + def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>; + def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64Zrm addr:$src)>; + + def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64Zrr FR64:$src)>; + def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64Zrm addr:$src)>; +} + // Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang // which produce unnecessary vmovs{s,d} instructions let Predicates = [HasAVX512] in { @@ -7347,7 +7419,7 @@ let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in { (ins _SrcRC.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, - (OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>, + (OpNodeInt (_SrcRC.ScalarIntMemFrags addr:$src)))]>, EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } //HasAVX512 @@ -7404,7 +7476,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), - (_Src.VT _Src.ScalarIntMemCPat:$src2)))>, + (_Src.ScalarIntMemFrags addr:$src2)))>, EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -7421,7 +7493,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ } } -// Scalar Coversion with SAE - suppress all exceptions +// Scalar 
Conversion with SAE - suppress all exceptions multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { @@ -7506,55 +7578,63 @@ def : Pat<(v2f64 (X86Movsd //===----------------------------------------------------------------------===// multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNode, + X86VectorVTInfo _Src, SDNode OpNode, SDNode MaskOpNode, X86FoldableSchedWrite sched, string Broadcast = _.BroadcastStr, string Alias = "", X86MemOperand MemOp = _Src.MemOp, RegisterClass MaskRC = _.KRCWM, - dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> { + dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src)))), + dag MaskLdDAG = (_.VT (MaskOpNode (_Src.VT (_Src.LdFrag addr:$src))))> { let Uses = [MXCSR], mayRaiseFPException = 1 in { - defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rr : AVX512_maskable_cvt<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), (ins _.RC:$src0, MaskRC:$mask, _Src.RC:$src), (ins MaskRC:$mask, _Src.RC:$src), OpcodeStr, "$src", "$src", (_.VT (OpNode (_Src.VT _Src.RC:$src))), - (vselect MaskRC:$mask, - (_.VT (OpNode (_Src.VT _Src.RC:$src))), - _.RC:$src0), - vselect, "$src0 = $dst">, + (vselect_mask MaskRC:$mask, + (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))), + _.RC:$src0), + (vselect_mask MaskRC:$mask, + (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))), + _.ImmAllZerosV)>, EVEX, Sched<[sched]>; - defm rm : AVX512_maskable_common<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm rm : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins MemOp:$src), (ins _.RC:$src0, MaskRC:$mask, MemOp:$src), (ins MaskRC:$mask, MemOp:$src), OpcodeStr#Alias, "$src", "$src", LdDAG, - (vselect MaskRC:$mask, LdDAG, _.RC:$src0), - vselect, "$src0 = $dst">, + (vselect_mask MaskRC:$mask, MaskLdDAG, _.RC:$src0), + (vselect_mask MaskRC:$mask, MaskLdDAG, _.ImmAllZerosV)>, EVEX, Sched<[sched.Folded]>; - defm rmb : AVX512_maskable_common<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm rmb : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _Src.ScalarMemOp:$src), (ins _.RC:$src0, MaskRC:$mask, _Src.ScalarMemOp:$src), (ins MaskRC:$mask, _Src.ScalarMemOp:$src), OpcodeStr, - "${src}"##Broadcast, "${src}"##Broadcast, + "${src}"#Broadcast, "${src}"#Broadcast, (_.VT (OpNode (_Src.VT (_Src.BroadcastLdFrag addr:$src)) )), - (vselect MaskRC:$mask, - (_.VT - (OpNode - (_Src.VT - (_Src.BroadcastLdFrag addr:$src)))), - _.RC:$src0), - vselect, "$src0 = $dst">, + (vselect_mask MaskRC:$mask, + (_.VT + (MaskOpNode + (_Src.VT + (_Src.BroadcastLdFrag addr:$src)))), + _.RC:$src0), + (vselect_mask MaskRC:$mask, + (_.VT + (MaskOpNode + (_Src.VT + (_Src.BroadcastLdFrag addr:$src)))), + _.ImmAllZerosV)>, EVEX, EVEX_B, Sched<[sched.Folded]>; } } -// Coversion with SAE - suppress all exceptions +// Conversion with SAE - suppress all exceptions multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { @@ -7581,12 +7661,14 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, // Similar to avx512_vcvt_fp, but uses an extload for the memory form. 
multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode, + SDNode MaskOpNode, X86FoldableSchedWrite sched, string Broadcast = _.BroadcastStr, string Alias = "", X86MemOperand MemOp = _Src.MemOp, RegisterClass MaskRC = _.KRCWM> - : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, sched, Broadcast, Alias, - MemOp, MaskRC, + : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, MaskOpNode, sched, Broadcast, + Alias, MemOp, MaskRC, + (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src)), (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>; // Extend Float to Double @@ -7594,69 +7676,72 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info, - any_fpextend, sched.ZMM>, + any_fpextend, fpextend, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, X86vfpextSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info, - X86any_vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, any_fpextend, - sched.YMM>, EVEX_V256; + X86any_vfpext, X86vfpext, sched.XMM, "{1to2}", + "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, + any_fpextend, fpextend, sched.YMM>, EVEX_V256; } } // Truncate Double to Float multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86any_vfpround, sched.ZMM>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, + X86any_vfpround, X86vfpround, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfproundRnd, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, - null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86any_vfpround, + null_frag, null_frag, sched.XMM, "{1to2}", "{x}", + f128mem, VK2WM>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, + X86any_vfpround, X86vfpround, sched.YMM, "{1to4}", "{y}">, EVEX_V256; } - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|" "$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbk") 
VR128X:$dst, VK2WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, VK2WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|" "$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, VK4WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, VK4WM:$mask, f64mem:$src), 0, "att">; @@ -7701,81 +7786,91 @@ let Predicates = [HasVLX] in { // Convert Signed/Unsigned Doubleword to Double let Uses = []<Register>, mayRaiseFPException = 0 in multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNode128, + SDNode MaskOpNode128, + X86SchedWriteWidths sched> { // No rounding in this op let Predicates = [HasAVX512] in defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode, - sched.ZMM>, EVEX_V512; + MaskOpNode, sched.ZMM>, EVEX_V512; let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, - OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM, + OpNode128, MaskOpNode128, sched.XMM, "{1to2}", + "", i64mem, VK2WM, (v2f64 (OpNode128 (bc_v4i32 (v2i64 + (scalar_to_vector (loadi64 addr:$src)))))), + (v2f64 (MaskOpNode128 (bc_v4i32 + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Signed/Unsigned Doubleword to Float multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, OpNodeRnd, sched.ZMM>, EVEX_V512; let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode, - sched.XMM>, EVEX_V128; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, 
v8i32x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Float to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, OpNodeSAE, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, - sched.XMM>, EVEX_V128; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Float to Signed/Unsigned Doubleword multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, - sched.XMM>, EVEX_V128; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Double to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeSAE, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeSAE, + X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNodeSAE, sched.ZMM>, EVEX_V512; } @@ -7785,50 +7880,50 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. 
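The avx512_cvttps2dq and avx512_cvtps2dq multiclasses above differ only in which DAG node they wrap: the "tt" forms truncate toward zero, while the plain forms convert using the current MXCSR rounding mode. A small C++ analogy (not the instruction selection itself), assuming round-to-nearest is the current mode:

    #include <cfenv>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    int32_t cvtt(float x) { return static_cast<int32_t>(x); }              // truncation, cvttps2dq-style
    int32_t cvt(float x)  { return static_cast<int32_t>(std::lrintf(x)); } // current rounding, cvtps2dq-style

    int main() {
      std::fesetround(FE_TONEAREST);
      std::printf("%d %d\n", cvtt(2.7f), cvt(2.7f)); // prints "2 3"
    }
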
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, - null_frag, sched.XMM, "{1to2}", "{x}", f128mem, + null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, - sched.YMM, "{1to4}", "{y}">, EVEX_V256; + MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256; } - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, VK2WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, VK2WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, VK4WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, VK4WM:$mask, f64mem:$src), 0, "att">; @@ -7836,10 +7931,11 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Convert Double to Signed/Unsigned Doubleword multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, 
SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasAVX512] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } @@ -7849,48 +7945,48 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, - null_frag, sched.XMM, "{1to2}", "{x}", f128mem, + null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, - sched.YMM, "{1to4}", "{y}">, EVEX_V256; + MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256; } - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, VK2WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, VK2WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, VK4WM:$mask, f64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} 
{z}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, VK4WM:$mask, f64mem:$src), 0, "att">; @@ -7898,61 +7994,65 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Convert Double to Signed/Unsigned Quardword multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, - sched.XMM>, EVEX_V128; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Double to Signed/Unsigned Quardword with truncation multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, - sched.XMM>, EVEX_V128; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Signed/Unsigned Quardword to Double multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode, - sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible; + MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode, - sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible; + MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible; } } // Convert Float to Signed/Unsigned Quardword multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } @@ -7960,21 +8060,26 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, 
- sched.XMM, "{1to2}", "", f64mem, VK2WM, + MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM, (v2i64 (OpNode (bc_v4f32 (v2f64 + (scalar_to_vector (loadf64 addr:$src)))))), + (v2i64 (MaskOpNode (bc_v4f32 + (v2f64 (scalar_to_vector (loadf64 addr:$src))))))>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Float to Signed/Unsigned Quardword with truncation multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, sched.ZMM>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } @@ -7982,22 +8087,26 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, - sched.XMM, "{1to2}", "", f64mem, VK2WM, + MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM, (v2i64 (OpNode (bc_v4f32 (v2f64 + (scalar_to_vector (loadf64 addr:$src)))))), + (v2i64 (MaskOpNode (bc_v4f32 + (v2f64 (scalar_to_vector (loadf64 addr:$src))))))>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, - sched.YMM>, EVEX_V256; + MaskOpNode, sched.YMM>, EVEX_V256; } } // Convert Signed/Unsigned Quardword to Float multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched> { + SDNode MaskOpNode, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { let Predicates = [HasDQI] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode, - sched.ZMM>, + MaskOpNode, sched.ZMM>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; } @@ -8007,152 +8116,159 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. 
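The Z128 forms of avx512_cvtps2qq/avx512_cvttps2qq above consume only two source floats, which is why their memory pattern is spelled as a 64-bit scalar load (loadf64) bitcast to v4f32. A hedged scalar sketch of what that memory form computes, showing the truncating flavour:

    #include <cstdint>
    #include <cstring>

    // Load 64 bits, reinterpret as two floats, truncate each to int64
    // (roughly the vcvttps2qq xmm, m64 form described by the pattern above).
    void cvttps2qq_m64(const void *src, int64_t dst[2]) {
      float f[2];
      std::memcpy(f, src, sizeof f);        // the 64-bit load
      dst[0] = static_cast<int64_t>(f[0]);  // truncation toward zero
      dst[1] = static_cast<int64_t>(f[1]);
    }
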
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag, - sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>, + null_frag, sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>, EVEX_V128, NotEVEX2VEXConvertible; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, - sched.YMM, "{1to4}", "{y}">, EVEX_V256, + MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256, NotEVEX2VEXConvertible; } - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", + def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst, VK2WM:$mask, VR128X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, i64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst, VK2WM:$mask, i64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to2}}", (!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst, VK2WM:$mask, i64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|" "$dst {${mask}}, $src}", (!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|" "$dst {${mask}} {z}, $src}", (!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst, VK4WM:$mask, VR256X:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, i64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|" "$dst {${mask}}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst, VK4WM:$mask, i64mem:$src), 0, "att">; - def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|" + def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|" "$dst {${mask}} {z}, ${src}{1to4}}", (!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst, VK4WM:$mask, i64mem:$src), 0, "att">; } -defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, X86any_VSintToFP, +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp, + X86any_VSintToFP, X86VSintToFP, SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; -defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, 
"vcvtdq2ps", any_sint_to_fp, +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si, - X86cvttp2siSAE, SchedWriteCvtPS2DQ>, - XS, EVEX_CD8<32, CD8VF>; + X86cvttp2si, X86cvttp2siSAE, + SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si, - X86cvttp2siSAE, SchedWriteCvtPD2DQ>, + X86cvttp2si, X86cvttp2siSAE, + SchedWriteCvtPD2DQ>, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui, - X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS, - EVEX_CD8<32, CD8VF>; + X86cvttp2ui, X86cvttp2uiSAE, + SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui, - X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, + X86cvttp2ui, X86cvttp2uiSAE, + SchedWriteCvtPD2DQ>, PS, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp, - X86any_VUintToFP, SchedWriteCvtDQ2PD>, XS, - EVEX_CD8<32, CD8VH>; + uint_to_fp, X86any_VUintToFP, X86VUintToFP, + SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp, - X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD, - EVEX_CD8<32, CD8VF>; + uint_to_fp, X86VUintToFpRnd, + SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>; -defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, +defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VF>; -defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, +defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, +defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>; -defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, +defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W, PS, EVEX_CD8<64, CD8VF>; -defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, +defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, +defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; -defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, +defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, +defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si, - X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W, + X86cvttp2si, X86cvttp2siSAE, + SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si, - X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD, + X86cvttp2si, X86cvttp2siSAE, + SchedWriteCvtPS2DQ>, PD, 
EVEX_CD8<32, CD8VH>; defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui, - X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W, + X86cvttp2ui, X86cvttp2uiSAE, + SchedWriteCvtPD2DQ>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui, - X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD, + X86cvttp2ui, X86cvttp2uiSAE, + SchedWriteCvtPS2DQ>, PD, EVEX_CD8<32, CD8VH>; defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp, - X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, - EVEX_CD8<64, CD8VF>; + sint_to_fp, X86VSintToFpRnd, + SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp, - X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, - EVEX_CD8<64, CD8VF>; + uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>, + VEX_W, XS, EVEX_CD8<64, CD8VF>; defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp, - X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS, - EVEX_CD8<64, CD8VF>; + sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, + VEX_W, PS, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp, - X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD, - EVEX_CD8<64, CD8VF>; + uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>, + VEX_W, XD, EVEX_CD8<64, CD8VF>; let Predicates = [HasVLX] in { // Special patterns to allow use of X86mcvtp2Int for masking. Instruction @@ -8275,70 +8391,70 @@ let Predicates = [HasVLX] in { let Predicates = [HasDQI, HasVLX] in { def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), (VCVTPS2QQZ128rm addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - VR128X:$src0)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), (VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - v2i64x_info.ImmAllZerosV)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), (VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), (VCVTPS2UQQZ128rm addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - VR128X:$src0)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), (VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - v2i64x_info.ImmAllZerosV)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), (VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; def : Pat<(v2i64 (X86any_cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), (VCVTTPS2QQZ128rm addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - VR128X:$src0)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), (VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - v2i64x_info.ImmAllZerosV)), + def : Pat<(v2i64 
(vselect_mask VK2WM:$mask, + (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), (VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>; def : Pat<(v2i64 (X86any_cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))), (VCVTTPS2UQQZ128rm addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - VR128X:$src0)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + VR128X:$src0)), (VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(v2i64 (vselect VK2WM:$mask, - (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), - v2i64x_info.ImmAllZerosV)), + def : Pat<(v2i64 (vselect_mask VK2WM:$mask, + (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))), + v2i64x_info.ImmAllZerosV)), (VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>; } let Predicates = [HasVLX] in { def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTDQ2PDZ128rm addr:$src)>; - def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), - VR128X:$src0)), + def : Pat<(v2f64 (vselect_mask VK2WM:$mask, + (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + VR128X:$src0)), (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), - v2f64x_info.ImmAllZerosV)), + def : Pat<(v2f64 (vselect_mask VK2WM:$mask, + (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + v2f64x_info.ImmAllZerosV)), (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; def : Pat<(v2f64 (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), (VCVTUDQ2PDZ128rm addr:$src)>; - def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), - VR128X:$src0)), + def : Pat<(v2f64 (vselect_mask VK2WM:$mask, + (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + VR128X:$src0)), (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(v2f64 (vselect VK2WM:$mask, - (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), - v2f64x_info.ImmAllZerosV)), + def : Pat<(v2f64 (vselect_mask VK2WM:$mask, + (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))), + v2f64x_info.ImmAllZerosV)), (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>; } @@ -8408,16 +8524,17 @@ let Predicates = [HasDQI, HasVLX] in { let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, - X86MemOperand x86memop, PatFrag ld_frag, + X86MemOperand x86memop, dag ld_dag, X86FoldableSchedWrite sched> { - defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), + defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", + (X86any_cvtph2ps (_src.VT _src.RC:$src)), (X86cvtph2ps (_src.VT _src.RC:$src))>, T8PD, Sched<[sched]>; - defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), + defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), "vcvtph2ps", "$src", "$src", - (X86cvtph2ps (_src.VT - (ld_frag addr:$src)))>, + (X86any_cvtph2ps (_src.VT ld_dag)), + (X86cvtph2ps (_src.VT ld_dag))>, T8PD, Sched<[sched.Folded]>; } @@ -8432,23 +8549,22 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, } let Predicates = [HasAVX512] in - defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, 
v16i16x_info, f256mem, load, - WriteCvtPH2PSZ>, + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, + (load addr:$src), WriteCvtPH2PSZ>, avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, - load, WriteCvtPH2PSY>, EVEX, EVEX_V256, + (load addr:$src), WriteCvtPH2PSY>, EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, - load, WriteCvtPH2PS>, EVEX, EVEX_V128, + (bitconvert (v2i64 (X86vzload64 addr:$src))), + WriteCvtPH2PS>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; // Pattern match vcvtph2ps of a scalar i64 load. - def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), - (VCVTPH2PSZ128rm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSZ128rm addr:$src)>; } @@ -8460,7 +8576,7 @@ let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _dest.RC:$dst, - (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, + (X86any_cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>, Sched<[RR]>; let Constraints = "$src0 = $dst" in def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), @@ -8505,54 +8621,35 @@ let Predicates = [HasAVX512] in { WriteCvtPS2PHZ, WriteCvtPS2PHZSt>, avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; - let Predicates = [HasVLX] in { - defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem, - WriteCvtPS2PHY, WriteCvtPS2PHYSt>, - EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; - defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem, - WriteCvtPS2PH, WriteCvtPS2PHSt>, - EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; - } + + def : Pat<(store (v16i16 (X86any_cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), + (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>; +} + +let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem, + WriteCvtPS2PHY, WriteCvtPS2PHYSt>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem, + WriteCvtPS2PH, WriteCvtPS2PHSt>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), + (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))), + (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), + def : Pat<(store (v8i16 (X86any_cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst), (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>; - def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst), - (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>; -} - -// Patterns for matching conversions from float to half-float and vice versa. 
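vcvtph2ps, handled by the avx512_cvtph2ps classes above, widens IEEE 754 binary16 lanes to binary32. A self-contained reference decode of one lane, as a sketch for orientation only (the instruction of course performs this in hardware):

    #include <cmath>
    #include <cstdint>

    // Decode one IEEE 754 binary16 value to float (what each lane of vcvtph2ps produces).
    float halfToFloat(uint16_t h) {
      const uint32_t sign = (h >> 15) & 1;
      const uint32_t exp  = (h >> 10) & 0x1F;
      const uint32_t frac = h & 0x3FF;
      float mag;
      if (exp == 0)            // zero or subnormal: frac * 2^-24
        mag = std::ldexp(static_cast<float>(frac), -24);
      else if (exp == 0x1F)    // infinity or NaN
        mag = frac ? std::nanf("") : INFINITY;
      else                     // normal: (1024 + frac) * 2^(exp - 25)
        mag = std::ldexp(static_cast<float>(frac | 0x400), static_cast<int>(exp) - 25);
      return sign ? -mag : mag;
    }
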
-let Predicates = [HasVLX] in { - // Use MXCSR.RC for rounding instead of explicitly specifying the default - // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the - // configurations we support (the default). However, falling back to MXCSR is - // more consistent with other instructions, which are always controlled by it. - // It's encoded as 0b100. - def : Pat<(fp_to_f16 FR32X:$src), - (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr - (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>; - - def : Pat<(f16_to_fp GR16:$src), - (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr - (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >; - - def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))), - (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr - (v8i16 (VCVTPS2PHZ128rr - (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >; } // Unordered/Ordered scalar fp compare with Sae and set EFLAGS multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, string OpcodeStr, Domain d, - X86FoldableSchedWrite sched = WriteFCom> { + X86FoldableSchedWrite sched = WriteFComX> { let hasSideEffects = 0, Uses = [MXCSR] in def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>, @@ -8613,7 +8710,7 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG, + (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8646,7 +8743,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, - "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr, (OpNode (_.VT (_.BroadcastLdFrag addr:$src)))>, EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -8701,7 +8798,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>, + (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -8741,7 +8838,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, - "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr, (OpNode (_.VT (_.BroadcastLdFrag addr:$src)))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -8811,20 +8908,21 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm r: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (any_fsqrt _.RC:$src))>, EVEX, + (_.VT (any_fsqrt _.RC:$src)), + (_.VT (fsqrt _.RC:$src))>, EVEX, Sched<[sched]>; - 
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm m: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", - (any_fsqrt (_.VT - (bitconvert (_.LdFrag addr:$src))))>, EVEX, - Sched<[sched.Folded, sched.ReadAfterFold]>; - defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (any_fsqrt (_.VT (_.LdFrag addr:$src))), + (fsqrt (_.VT (_.LdFrag addr:$src)))>, EVEX, + Sched<[sched.Folded, sched.ReadAfterFold]>; + defm mb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, - "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, - (any_fsqrt (_.VT - (_.BroadcastLdFrag addr:$src)))>, + "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr, + (any_fsqrt (_.VT (_.BroadcastLdFrag addr:$src))), + (fsqrt (_.VT (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8879,7 +8977,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (X86fsqrts (_.VT _.RC:$src1), - _.ScalarIntMemCPat:$src2)>, + (_.ScalarIntMemFrags addr:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), @@ -8952,7 +9050,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales _.RC:$src1, - _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>, + (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3)))>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { @@ -8971,13 +9069,13 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX512] in { def : Pat<(X86any_VRndScale _.FRC:$src1, timm:$src2), - (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + (_.EltVT (!cast<Instruction>(NAME#r) (_.EltVT (IMPLICIT_DEF)), _.FRC:$src1, timm:$src2))>; } let Predicates = [HasAVX512, OptForSize] in { def : Pat<(X86any_VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2), - (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + (_.EltVT (!cast<Instruction>(NAME#m) (_.EltVT (IMPLICIT_DEF)), addr:$src1, timm:$src2))>; } } @@ -8996,13 +9094,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move, dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP, dag OutMask, Predicate BasePredicate> { let Predicates = [BasePredicate] in { - def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask, (OpNode (extractelt _.VT:$src2, (iPTR 0))), (extractelt _.VT:$dst, (iPTR 0))))), (!cast<Instruction>("V"#OpcPrefix#r_Intk) _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>; - def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask, (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), (!cast<Instruction>("V"#OpcPrefix#r_Intkz) @@ -9026,14 +9124,14 @@ defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd, // same order as X86vmtrunc, X86vmtruncs, X86vmtruncus. This allows us to pass // either to the multiclasses. 
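The select_trunc / select_truncs / select_truncus PatFrags defined just below wrap plain, signed-saturating, and unsigned-saturating narrowing under a vselect_mask. Lane-wise, the three flavours behave roughly as in this C++ sketch (dword-to-byte shown; the vpmov* mnemonics in the comments are for orientation only):

    #include <algorithm>
    #include <cstdint>

    int8_t  truncPlain(int32_t x)     { return static_cast<int8_t>(x); }                         // trunc       (vpmovdb)
    int8_t  truncSigned(int32_t x)    { return static_cast<int8_t>(std::clamp(x, -128, 127)); }  // X86vtruncs  (vpmovsdb)
    uint8_t truncUnsigned(uint32_t x) { return static_cast<uint8_t>(std::min<uint32_t>(x, 255u)); } // X86vtruncus (vpmovusdb)
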
def select_trunc : PatFrag<(ops node:$src, node:$src0, node:$mask), - (vselect node:$mask, - (trunc node:$src), node:$src0)>; + (vselect_mask node:$mask, + (trunc node:$src), node:$src0)>; def select_truncs : PatFrag<(ops node:$src, node:$src0, node:$mask), - (vselect node:$mask, - (X86vtruncs node:$src), node:$src0)>; + (vselect_mask node:$mask, + (X86vtruncs node:$src), node:$src0)>; def select_truncus : PatFrag<(ops node:$src, node:$src0, node:$mask), - (vselect node:$mask, - (X86vtruncus node:$src), node:$src0)>; + (vselect_mask node:$mask, + (X86vtruncus node:$src), node:$src0)>; multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, SDPatternOperator MaskNode, @@ -9083,12 +9181,12 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, string Name> { def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), - (!cast<Instruction>(Name#SrcInfo.ZSuffix##mr) + (!cast<Instruction>(Name#SrcInfo.ZSuffix#mr) addr:$dst, SrcInfo.RC:$src)>; def : Pat<(mtruncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst, SrcInfo.KRCWM:$mask), - (!cast<Instruction>(Name#SrcInfo.ZSuffix##mrk) + (!cast<Instruction>(Name#SrcInfo.ZSuffix#mrk) addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; } @@ -9548,6 +9646,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, let Predicates = [HasVLX] in { def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; @@ -9558,6 +9658,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; } @@ -9565,6 +9667,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, let Predicates = [HasAVX512] in { def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; + def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; + def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), + (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; } } @@ -9586,54 +9692,49 @@ def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))), // FIXME: Improve scheduling of gather/scatter instructions. 
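The gather multiclasses that follow drop their ISel patterns and are instead marked mayLoad with hasSideEffects = 0. The per-lane behaviour they model, including the mask write-back ($mask_wb), can be sketched in scalar C++ as below; this is a hedged illustration of the architectural semantics, not how the backend selects the instructions:

    #include <array>
    #include <cstdint>

    using V8i32 = std::array<int32_t, 8>;

    // Scalar model of a masked dword-index gather (vpgatherdd-style): each set
    // mask lane loads base[index[i]]; a completed lane clears its mask bit.
    V8i32 gatherMasked(const int32_t *base, V8i32 index, uint8_t &k, V8i32 passthru) {
      V8i32 dst = passthru;
      for (int i = 0; i < 8; ++i)
        if ((k >> i) & 1) {
          dst[i] = base[index[i]];
          k &= static_cast<uint8_t>(~(1u << i));  // mask write-back ($mask_wb)
        }
      return dst;
    }
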
multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag GatherNode, - RegisterClass MaskRC = _.KRCWM> { + X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> { let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", - ExeDomain = _.ExeDomain in + ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb), (ins _.RC:$src1, MaskRC:$mask, memop:$src2), !strconcat(OpcodeStr#_.Suffix, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - [(set _.RC:$dst, MaskRC:$mask_wb, - (GatherNode (_.VT _.RC:$src1), MaskRC:$mask, - vectoraddr:$src2))]>, EVEX, EVEX_K, - EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; + []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; } multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, - vy512xmem, mgatherv8i32>, EVEX_V512, VEX_W; - defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512, - vz512mem, mgatherv8i64>, EVEX_V512, VEX_W; + defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512, + vy512xmem>, EVEX_V512, VEX_W; + defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info512, + vz512mem>, EVEX_V512, VEX_W; let Predicates = [HasVLX] in { - defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, - vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W; - defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256, - vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W; - defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, - vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W; - defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W; + defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256, + vx256xmem>, EVEX_V256, VEX_W; + defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info256, + vy256xmem>, EVEX_V256, VEX_W; + defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128, + vx128xmem>, EVEX_V128, VEX_W; + defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128, + vx128xmem>, EVEX_V128, VEX_W; } } multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem, - mgatherv16i32>, EVEX_V512; - defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256mem, - mgatherv8i64>, EVEX_V512; + defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512, vz512mem>, + EVEX_V512; + defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info256, vz256mem>, + EVEX_V512; let Predicates = [HasVLX] in { - defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, - vy256xmem, mgatherv8i32>, EVEX_V256; - defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vy128xmem, mgatherv4i64>, EVEX_V256; - defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, - vx128xmem, mgatherv4i32>, EVEX_V128; - defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vx64xmem, mgatherv2i64, VK2WM>, - EVEX_V128; + defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256, + vy256xmem>, EVEX_V256; + defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info128, + vy128xmem>, EVEX_V256; + defm NAME#D#SUFF#Z128: avx512_gather<dopc, 
OpcodeStr#"d", _.info128, + vx128xmem>, EVEX_V128; + defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128, + vx64xmem, VK2WM>, EVEX_V128; } } @@ -9645,55 +9746,52 @@ defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">; multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag ScatterNode, - RegisterClass MaskRC = _.KRCWM> { + X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> { -let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in +let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain, + hasSideEffects = 0 in def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb), (ins memop:$dst, MaskRC:$mask, _.RC:$src), !strconcat(OpcodeStr#_.Suffix, "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - [(set MaskRC:$mask_wb, (ScatterNode (_.VT _.RC:$src), - MaskRC:$mask, vectoraddr:$dst))]>, - EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteStore]>; } multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, - vy512xmem, mscatterv8i32>, EVEX_V512, VEX_W; - defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512, - vz512mem, mscatterv8i64>, EVEX_V512, VEX_W; + defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512, + vy512xmem>, EVEX_V512, VEX_W; + defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info512, + vz512mem>, EVEX_V512, VEX_W; let Predicates = [HasVLX] in { - defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, - vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W; - defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256, - vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W; - defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, - vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W; - defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, - vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W; + defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256, + vx256xmem>, EVEX_V256, VEX_W; + defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info256, + vy256xmem>, EVEX_V256, VEX_W; + defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128, + vx128xmem>, EVEX_V128, VEX_W; + defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128, + vx128xmem>, EVEX_V128, VEX_W; } } multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { - defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem, - mscatterv16i32>, EVEX_V512; - defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256mem, - mscatterv8i64>, EVEX_V512; + defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512, vz512mem>, + EVEX_V512; + defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info256, vz256mem>, + EVEX_V512; let Predicates = [HasVLX] in { - defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, - vy256xmem, mscatterv8i32>, EVEX_V256; - defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128, - vy128xmem, mscatterv4i64>, EVEX_V256; - defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, - vx128xmem, mscatterv4i32>, EVEX_V128; - defm 
NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, - vx64xmem, mscatterv2i64, VK2WM>, - EVEX_V128; + defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256, + vy256xmem>, EVEX_V256; + defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info128, + vy128xmem>, EVEX_V256; + defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128, + vx128xmem>, EVEX_V128; + defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128, + vx64xmem, VK2WM>, EVEX_V128; } } @@ -9762,13 +9860,9 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), - !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), + !strconcat(OpcodeStr#Vec.Suffix, "\t{$src, $dst|$dst, $src}"), [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>, EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc? - -// Also need a pattern for anyextend. -def : Pat<(Vec.VT (anyext Vec.KRC:$src)), - (!cast<Instruction>(NAME#"rr") Vec.KRC:$src)>; } multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, @@ -9842,19 +9936,11 @@ let Predicates = [HasDQI, NoBWI] in { (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>; def : Pat<(v16i16 (sext (v16i1 VK16:$src))), (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>; - - def : Pat<(v16i8 (anyext (v16i1 VK16:$src))), - (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>; - def : Pat<(v16i16 (anyext (v16i1 VK16:$src))), - (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>; } let Predicates = [HasDQI, NoBWI, HasVLX] in { def : Pat<(v8i16 (sext (v8i1 VK8:$src))), (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>; - - def : Pat<(v8i16 (anyext (v8i1 VK8:$src))), - (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>; } //===----------------------------------------------------------------------===// @@ -9885,14 +9971,14 @@ multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _, multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> { def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix##mrk) + (!cast<Instruction>(Name#_.ZSuffix#mrk) addr:$dst, _.KRCWM:$mask, _.RC:$src)>; def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix##rrk) + (!cast<Instruction>(Name#_.ZSuffix#rrk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>; def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix##rrkz) + (!cast<Instruction>(Name#_.ZSuffix#rrkz) _.KRCWM:$mask, _.RC:$src)>; } @@ -9940,23 +10026,23 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> { def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)), - (!cast<Instruction>(Name#_.ZSuffix##rmkz) + (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$src)>; def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)), - (!cast<Instruction>(Name#_.ZSuffix##rmkz) + (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$src)>; def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, (_.VT _.RC:$src0))), - (!cast<Instruction>(Name#_.ZSuffix##rmk) + (!cast<Instruction>(Name#_.ZSuffix#rmk) _.RC:$src0, _.KRCWM:$mask, addr:$src)>; def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask), - 
(!cast<Instruction>(Name#_.ZSuffix##rrk) + (!cast<Instruction>(Name#_.ZSuffix#rrk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src)>; def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask), - (!cast<Instruction>(Name#_.ZSuffix##rrkz) + (!cast<Instruction>(Name#_.ZSuffix#rrkz) _.KRCWM:$mask, _.RC:$src)>; } @@ -9990,26 +10076,33 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256, // op(mem_vec,imm) // op(broadcast(eltVt),imm) //all instruction created with FROUND_CURRENT -multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86FoldableSchedWrite sched, X86VectorVTInfo _> { +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode MaskOpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { - defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rri : AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), - (i32 timm:$src2))>, Sched<[sched]>; - defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (i32 timm:$src2)), + (MaskOpNode (_.VT _.RC:$src1), (i32 timm:$src2))>, + Sched<[sched]>; + defm rmi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 timm:$src2))>, + (i32 timm:$src2)), + (MaskOpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 timm:$src2))>, Sched<[sched.Folded, sched.ReadAfterFold]>; - defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + defm rmbi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, - "${src1}"##_.BroadcastStr##", $src2", + OpcodeStr#_.Suffix, "$src2, ${src1}"#_.BroadcastStr, + "${src1}"#_.BroadcastStr#", $src2", (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)), - (i32 timm:$src2))>, EVEX_B, + (i32 timm:$src2)), + (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src1)), + (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10021,7 +10114,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), - OpcodeStr##_.Suffix, "$src2, {sae}, $src1", + OpcodeStr#_.Suffix, "$src2, {sae}, $src1", "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), (i32 timm:$src2))>, @@ -10030,18 +10123,19 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, - SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ + SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched, + Predicate prd>{ let Predicates = [prd] in { - defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, - _.info512>, + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.ZMM, _.info512>, avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE, sched.ZMM, _.info512>, EVEX_V512; } let Predicates = [prd, HasVLX] 
in { - defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM, - _.info128>, EVEX_V128; - defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM, - _.info256>, EVEX_V256; + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.XMM, _.info128>, EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, + sched.YMM, _.info256>, EVEX_V256; } } @@ -10068,8 +10162,8 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), - OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $src3", + OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), (i32 timm:$src3))>, EVEX_B, @@ -10111,8 +10205,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, let ExeDomain = _.ExeDomain in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $src3", + OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (_.BroadcastLdFrag addr:$src2)), (i8 timm:$src3))>, EVEX_B, @@ -10135,7 +10229,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), - (_.VT _.ScalarIntMemCPat:$src2), + (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3))>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10228,24 +10322,26 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, bits<8> opcPs, bits<8> opcPd, SDNode OpNode, - SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{ + SDNode MaskOpNode, SDNode OpNodeSAE, + X86SchedWriteWidths sched, Predicate prd>{ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, - opcPs, OpNode, OpNodeSAE, sched, prd>, + opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>, EVEX_CD8<32, CD8VF>; defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, - opcPd, OpNode, OpNodeSAE, sched, prd>, + opcPd, OpNode, MaskOpNode, OpNodeSAE, sched, prd>, EVEX_CD8<64, CD8VF>, VEX_W; } defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, - X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>, - AVX512AIi8Base, EVEX; + X86VReduce, X86VReduce, X86VReduceSAE, + SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX; defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, - X86any_VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>, + X86any_VRndScale, X86VRndScale, X86VRndScaleSAE, + SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, - X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>, - AVX512AIi8Base, EVEX; + X86VGetMant, X86VGetMant, X86VGetMantSAE, + SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX; defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, X86VRangeSAE, @@ 
-10302,8 +10398,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $src3", + OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $src3", (_.VT (bitconvert (CastInfo.VT @@ -10391,8 +10487,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr, defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $src3", + OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", + "$src1, ${src2}"#_.BroadcastStr#", $src3", (X86VAlign _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)), (i8 timm:$src3))>, EVEX_B, @@ -10441,40 +10537,40 @@ def ValigndImm8XForm : SDNodeXForm<timm, [{ multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode, X86VectorVTInfo From, X86VectorVTInfo To, SDNodeXForm ImmXForm> { - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (bitconvert - (From.VT (OpNode From.RC:$src1, From.RC:$src2, - timm:$src3))), - To.RC:$src0)), + def : Pat<(To.VT (vselect_mask To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + timm:$src3))), + To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, (ImmXForm timm:$src3))>; - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (bitconvert - (From.VT (OpNode From.RC:$src1, From.RC:$src2, - timm:$src3))), - To.ImmAllZerosV)), + def : Pat<(To.VT (vselect_mask To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + timm:$src3))), + To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask, To.RC:$src1, To.RC:$src2, (ImmXForm timm:$src3))>; - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (bitconvert - (From.VT (OpNode From.RC:$src1, - (From.LdFrag addr:$src2), - timm:$src3))), - To.RC:$src0)), + def : Pat<(To.VT (vselect_mask To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (From.LdFrag addr:$src2), + timm:$src3))), + To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (bitconvert - (From.VT (OpNode From.RC:$src1, - (From.LdFrag addr:$src2), - timm:$src3))), - To.ImmAllZerosV)), + def : Pat<(To.VT (vselect_mask To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (From.LdFrag addr:$src2), + timm:$src3))), + To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; @@ -10491,24 +10587,24 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (bitconvert - (From.VT (OpNode From.RC:$src1, - (bitconvert - (To.VT (To.BroadcastLdFrag addr:$src2))), - timm:$src3))), - To.RC:$src0)), + def : Pat<(To.VT (vselect_mask To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), + To.RC:$src0)), (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; - def : Pat<(To.VT 
(vselect To.KRCWM:$mask, - (bitconvert - (From.VT (OpNode From.RC:$src1, - (bitconvert - (To.VT (To.BroadcastLdFrag addr:$src2))), - timm:$src3))), - To.ImmAllZerosV)), + def : Pat<(To.VT (vselect_mask To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (To.BroadcastLdFrag addr:$src2))), + timm:$src3))), + To.ImmAllZerosV)), (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask, To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; @@ -10567,8 +10663,8 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> { defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1), OpcodeStr, - "${src1}"##_.BroadcastStr, - "${src1}"##_.BroadcastStr, + "${src1}"#_.BroadcastStr, + "${src1}"#_.BroadcastStr, (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; @@ -10751,32 +10847,14 @@ defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle> let Predicates = [HasVLX] in { def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), - (VMOVDDUPZ128rm addr:$src)>; -def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), - (VMOVDDUPZ128rm addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), - (v2f64 VR128X:$src0)), +def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (v2f64 VR128X:$src0)), (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), - immAllZerosV), +def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; - -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), - (v2f64 VR128X:$src0)), - (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), - immAllZerosV), - (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; - -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), - (v2f64 VR128X:$src0)), - (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), - immAllZerosV), - (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } //===----------------------------------------------------------------------===// @@ -10784,9 +10862,9 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load //===----------------------------------------------------------------------===// let Uses = []<Register>, mayRaiseFPException = 0 in { -defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512, +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, X86Unpckh, HasAVX512, SchedWriteFShuffleSizes, 0, 1>; -defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512, +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, X86Unpckl, HasAVX512, SchedWriteFShuffleSizes>; } @@ -10945,16 +11023,15 @@ defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, // AVX-512 - Byte shift Left/Right //===----------------------------------------------------------------------===// -// 
FIXME: The SSE/AVX names are PSLLDQri etc. - should we add the i here as well? multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, Format MRMm, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - def rr : AVX512<opc, MRMr, + def ri : AVX512<opc, MRMr, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>, Sched<[sched]>; - def rm : AVX512<opc, MRMm, + def mi : AVX512<opc, MRMm, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst,(_.VT (OpNode @@ -11106,8 +11183,8 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), - OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", - "$src2, ${src3}"##_.BroadcastStr##", $src4", + OpcodeStr, "$src4, ${src3}"#_.BroadcastStr#", $src2", + "$src2, ${src3}"#_.BroadcastStr#", $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (_.BroadcastLdFrag addr:$src3)), @@ -11117,12 +11194,12 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, }// Constraints = "$src1 = $dst" // Additional patterns for matching passthru operand in other positions. - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, @@ -11141,13 +11218,13 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, // Additional patterns for matching zero masking with loads in other // positions. - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), @@ -11156,31 +11233,31 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, // Additional patterns for matching masked loads with different // operand orders. 
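These alternate operand orders are legal because a vpternlog's behaviour is fully encoded in its 8-bit truth-table immediate: reordering the sources only requires remapping that immediate, which is what the VPTERNLOG132/213/231/321_imm8 transforms used in these patterns do. A minimal sketch of such a transform, with a hypothetical name and a generic bit loop rather than the in-tree bit twiddling:

// Sketch only: swap the roles of the first and third ternlog source.
// Bit (a<<2 | b<<1 | c) of the immediate holds the result for source bits
// a, b, c, so swapping sources 0 and 2 permutes those bits of the immediate.
def SwapSrc0Src2TernlogImm : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned Idx = 0; Idx != 8; ++Idx) {
    unsigned A = (Idx >> 2) & 1, B = (Idx >> 1) & 1, C = Idx & 1;
    if (Imm & (1 << Idx))
      NewImm |= 1 << ((C << 2) | (B << 1) | A); // read at (a,b,c), write at (c,b,a)
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;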
- def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)), _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src3)), _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), @@ -11200,14 +11277,14 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, // Additional patterns for matching zero masking with broadcasts in other // positions. - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4)), @@ -11218,32 +11295,32 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, // Additional patterns for matching masked broadcasts with different // operand orders. 
- def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, (_.BroadcastLdFrag addr:$src3), (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode _.RC:$src2, (_.BroadcastLdFrag addr:$src3), _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, + def : Pat<(_.VT (vselect_mask _.KRCWM:$mask, (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), @@ -11288,6 +11365,36 @@ let Predicates = [HasVLX] in { (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, + (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + VR128X:$src2, VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, + (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, + (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i8 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + VR128X:$src2, VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i8 (X86vpternlog VR128X:$src1, + (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, (i8 timm:$src4))), (VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3, @@ -11305,6 +11412,66 @@ let Predicates = [HasVLX] in { (VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, + (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), 
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, + (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, + (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i16 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + VR128X:$src2, VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i16 (X86vpternlog VR128X:$src1, + (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v4i32 (X86vpternlog VR128X:$src1, VR128X:$src2, + (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v4i32 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + VR128X:$src2, VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v4i32 (X86vpternlog VR128X:$src1, + (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v2i64 (X86vpternlog VR128X:$src1, VR128X:$src2, + (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v2i64 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + VR128X:$src2, VR128X:$src1, (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v2i64 (X86vpternlog VR128X:$src1, + (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))), + VR128X:$src2, (i8 timm:$src4))), + (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 timm:$src4))), (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, @@ -11322,6 +11489,36 @@ let Predicates = [HasVLX] in { (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, + (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + VR256X:$src2, VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, + (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, + (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + (i8 
timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i8 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + VR256X:$src2, VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i8 (X86vpternlog VR256X:$src1, + (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3, (i8 timm:$src4))), (VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3, @@ -11338,6 +11535,66 @@ let Predicates = [HasVLX] in { VR256X:$src2, (i8 timm:$src4))), (VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, + (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + VR256X:$src2, VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, + (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, + (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i16 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + VR256X:$src2, VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i16 (X86vpternlog VR256X:$src1, + (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v8i32 (X86vpternlog VR256X:$src1, VR256X:$src2, + (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i32 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + VR256X:$src2, VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i32 (X86vpternlog VR256X:$src1, + (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))), + VR256X:$src2, (i8 timm:$src4))), + (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v4i64 (X86vpternlog VR256X:$src1, VR256X:$src2, + (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v4i64 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + VR256X:$src2, VR256X:$src1, (i8 timm:$src4))), + (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v4i64 (X86vpternlog VR256X:$src1, + (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))), + VR256X:$src2, (i8 timm:$src4))), + 
(VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; } let Predicates = [HasAVX512] in { @@ -11358,6 +11615,36 @@ let Predicates = [HasAVX512] in { (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v64i8 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v64i8 (X86vpternlog VR512:$src1, + (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3, (i8 timm:$src4))), (VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3, @@ -11371,9 +11658,84 @@ let Predicates = [HasAVX512] in { (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3), - VR512:$src2, (i8 timm:$src4))), + VR512:$src2, (i8 timm:$src4))), (VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, + (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v16i32 
(X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v32i16 (X86vpternlog VR512:$src1, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v16i32 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v16i32 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v16i32 (X86vpternlog VR512:$src1, + (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; + + def : Pat<(v8i64 (X86vpternlog VR512:$src1, VR512:$src2, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + timm:$src4)>; + def : Pat<(v8i64 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, VR512:$src1, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG321_imm8 timm:$src4))>; + def : Pat<(v8i64 (X86vpternlog VR512:$src1, + (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))), + VR512:$src2, (i8 timm:$src4))), + (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, + (VPTERNLOG132_imm8 timm:$src4))>; } // Patterns to implement vnot using vpternlog instead of creating all ones @@ -11484,14 +11846,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT _.RC:$src3), (i32 timm:$src4))>, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), @@ -11499,8 +11861,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2", - "$src2, ${src3}"##_.BroadcastStr##", $src4", + OpcodeStr#_.Suffix, "$src4, ${src3}"#_.BroadcastStr#", $src2", + "$src2, ${src3}"#_.BroadcastStr#", $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), @@ -11516,7 +11878,7 @@ multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr, let Constraints = "$src1 
= $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", + OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2", "$src2, $src3, {sae}, $src4", (X86VFixupimmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), @@ -11533,7 +11895,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), @@ -11541,7 +11903,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, let Uses = [MXCSR] in defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", + OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2", "$src2, $src3, {sae}, $src4", (X86VFixupimmSAEs (_.VT _.RC:$src1), (_.VT _.RC:$src2), @@ -11550,7 +11912,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), - OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT (scalar_to_vector @@ -11630,8 +11992,9 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info, // TODO: Some canonicalization in lowering would simplify the number of // patterns we have to try to match. 
-multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode, - X86VectorVTInfo _, PatLeaf ZeroFP> { +multiclass AVX512_scalar_math_fp_patterns<SDNode Op, SDNode MaskedOp, + string OpcPrefix, SDNode MoveNode, + X86VectorVTInfo _, PatLeaf ZeroFP> { let Predicates = [HasAVX512] in { // extracted scalar math op with insert via movss def : Pat<(MoveNode @@ -11639,79 +12002,79 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo (_.VT (scalar_to_vector (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), _.FRC:$src)))), - (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst, + (!cast<Instruction>("V"#OpcPrefix#"Zrr_Int") _.VT:$dst, (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>; def : Pat<(MoveNode (_.VT VR128X:$dst), (_.VT (scalar_to_vector (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), (_.ScalarLdFrag addr:$src))))), - (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>; + (!cast<Instruction>("V"#OpcPrefix#"Zrm_Int") _.VT:$dst, addr:$src)>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (_.EltVT - (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src2), + (X86selects_mask VK1WM:$mask, + (MaskedOp (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), _.FRC:$src0))), - (!cast<Instruction>("V"#OpcPrefix#Zrr_Intk) + (!cast<Instruction>("V"#OpcPrefix#"Zrr_Intk") (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (_.EltVT - (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (_.ScalarLdFrag addr:$src2)), + (X86selects_mask VK1WM:$mask, + (MaskedOp (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src2)), _.FRC:$src0))), - (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk) + (!cast<Instruction>("V"#OpcPrefix#"Zrm_Intk") (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), VK1WM:$mask, _.VT:$src1, addr:$src2)>; // extracted masked scalar math op with insert via movss def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (_.EltVT - (extractelt (_.VT VR128X:$src1), (iPTR 0))), - _.FRC:$src2), (_.EltVT ZeroFP)))), - (!cast<I>("V"#OpcPrefix#Zrr_Intkz) + (X86selects_mask VK1WM:$mask, + (MaskedOp (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), (_.EltVT ZeroFP)))), + (!cast<I>("V"#OpcPrefix#"Zrr_Intkz") VK1WM:$mask, _.VT:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (_.EltVT - (extractelt (_.VT VR128X:$src1), (iPTR 0))), - (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))), - (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>; + (X86selects_mask VK1WM:$mask, + (MaskedOp (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))), + (!cast<I>("V"#OpcPrefix#"Zrm_Intkz") VK1WM:$mask, _.VT:$src1, addr:$src2)>; } } -defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>; -defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>; -defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>; -defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>; +defm : 
AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>; -defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>; -defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>; -defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>; -defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>; multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, X86VectorVTInfo _> { let Predicates = [HasAVX512] in { def : Pat<(_.VT (Move _.VT:$dst, (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))), - (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src)>; + (!cast<Instruction>("V"#OpcPrefix#"Zr_Int") _.VT:$dst, _.VT:$src)>; } } -defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>; -defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>; +defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32x_info>; +defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64x_info>; //===----------------------------------------------------------------------===// // AES instructions @@ -11724,13 +12087,13 @@ multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> { loadv2i64, 0, VR128X, i128mem>, EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG; defm Z256 : AESI_binop_rm_int<Op, OpStr, - !cast<Intrinsic>(IntPrefix##"_256"), + !cast<Intrinsic>(IntPrefix#"_256"), loadv4i64, 0, VR256X, i256mem>, EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512, HasVAES] in defm Z : AESI_binop_rm_int<Op, OpStr, - !cast<Intrinsic>(IntPrefix##"_512"), + !cast<Intrinsic>(IntPrefix#"_512"), loadv8i64, 0, VR512, i512mem>, EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG; } @@ -11792,8 +12155,8 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, ExeDomain = VTI.ExeDomain in defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr, - "${src3}"##VTI.BroadcastStr##", $src2", - "$src2, ${src3}"##VTI.BroadcastStr, + "${src3}"#VTI.BroadcastStr#", $src2", + "$src2, ${src3}"#VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, AVX512FMA3Base, EVEX_B, @@ -11827,22 +12190,22 @@ multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, } multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix, SDNode OpNode, X86SchedWriteWidths sched> { - defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, sched, + defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; - defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, sched, + 
defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; - defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, sched, + defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; } multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix, SDNode OpNode, X86SchedWriteWidths sched> { - defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", sched, + defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix#"w", sched, avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>, VEX_W, EVEX_CD8<16, CD8VF>; - defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp, + defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp, OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; - defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode, + defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; } @@ -11890,8 +12253,8 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), - OpStr, "${src3}"##VTI.BroadcastStr##", $src2", - "$src2, ${src3}"##VTI.BroadcastStr, + OpStr, "${src3}"#VTI.BroadcastStr#", $src2", + "$src2, ${src3}"#VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, @@ -12027,8 +12390,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, let ExeDomain = VTI.ExeDomain in defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3), - OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", - "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", + OpStr, "$src3, ${src2}"#BcstVTI.BroadcastStr#", $src1", + "$src1, ${src2}"#BcstVTI.BroadcastStr#", $src3", (OpNode (VTI.VT VTI.RC:$src1), (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))), (i8 timm:$src3))>, EVEX_B, @@ -12184,41 +12547,44 @@ multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr, } } +let ExeDomain = SSEPackedSingle in defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", - SchedWriteCvtPD2PS, //FIXME: Shoulod be SchedWriteCvtPS2BF + SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF avx512vl_f32_info, avx512vl_i16_info, X86cvtne2ps2bf16, HasBF16, 0>, T8XD; // Truncate Float to BFloat16 multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { + let ExeDomain = SSEPackedSingle in { let Predicates = [HasBF16], Uses = []<Register>, mayRaiseFPException = 0 in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info, - X86cvtneps2bf16, sched.ZMM>, EVEX_V512; + X86cvtneps2bf16, X86cvtneps2bf16, sched.ZMM>, EVEX_V512; } let Predicates = [HasBF16, HasVLX] in { let Uses = []<Register>, mayRaiseFPException = 0 in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info, - null_frag, sched.XMM, "{1to4}", "{x}", f128mem, + null_frag, null_frag, sched.XMM, "{1to4}", "{x}", f128mem, VK4WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info, - X86cvtneps2bf16, + X86cvtneps2bf16, X86cvtneps2bf16, sched.YMM, "{1to8}", "{y}">, EVEX_V256; } + } // Predicates = [HasBF16, HasVLX] + } // ExeDomain = SSEPackedSingle - def : 
InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, - VR128X:$src), 0>; - def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, - f128mem:$src), 0, "intel">; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, - VR256X:$src), 0>; - def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", - (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, - f256mem:$src), 0, "intel">; - } + def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, + VR128X:$src), 0>; + def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, + f128mem:$src), 0, "intel">; + def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, + VR256X:$src), 0>; + def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, + f256mem:$src), 0, "intel">; } defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16", @@ -12262,25 +12628,24 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo src_v> { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.RC:$src3), + (ins src_v.RC:$src2, src_v.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, + (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>, EVEX_4V, Sched<[sched]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.MemOp:$src3), + (ins src_v.RC:$src2, src_v.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (src_v.VT (bitconvert - (src_v.LdFrag addr:$src3)))))>, EVEX_4V, + (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, + (src_v.LdFrag addr:$src3)))>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.ScalarMemOp:$src3), + (ins src_v.RC:$src2, src_v.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), - (_.VT (OpNode _.RC:$src1, _.RC:$src2, + (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -12302,6 +12667,7 @@ multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, } } +let ExeDomain = SSEPackedSingle in defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA, avx512vl_f32_info, avx512vl_i32_info, HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td index 1e399a894490..f7f22285bd15 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -605,16 +605,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, - Imm8, i8imm, relocImm8_su, i8imm, invalid_node, + Imm8, i8imm, imm_su, i8imm, invalid_node, 0, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, - Imm16, i16imm, relocImm16_su, i16i8imm, i16immSExt8_su, + Imm16, i16imm, imm_su, 
i16i8imm, i16immSExt8_su, 1, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, - Imm32, i32imm, relocImm32_su, i32i8imm, i32immSExt8_su, + Imm32, i32imm, imm_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, - Imm32S, i64i32imm, i64relocImmSExt32_su, i64i8imm, i64immSExt8_su, + Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su, 1, OpSizeFixed, 1>; /// ITy - This instruction base class takes the type info for the instruction. @@ -1217,6 +1217,146 @@ def : Pat<(store (X86adc_flag GR64:$src, (loadi64 addr:$dst), EFLAGS), addr:$dst), (ADC64mr addr:$dst, GR64:$src)>; +// Patterns for basic arithmetic ops with relocImm for the immediate field. +multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> { + def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2), + (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>; + def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2), + (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>; + def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2), + (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>; + def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2), + (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>; + + def : Pat<(store (OpNode (load addr:$dst), relocImm8_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>; + def : Pat<(store (OpNode (load addr:$dst), i16relocImmSExt8_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>; + def : Pat<(store (OpNode (load addr:$dst), relocImm16_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>; + def : Pat<(store (OpNode (load addr:$dst), i32relocImmSExt8_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>; + def : Pat<(store (OpNode (load addr:$dst), relocImm32_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>; + def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt8_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>; + def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt32_su:$src), addr:$dst), + (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>; +} + +multiclass ArithBinOp_RFF_relocImm_Pats<SDNode OpNodeFlag> { + def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2, EFLAGS), + (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>; + def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2, EFLAGS), + (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2, EFLAGS), + (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>; + def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2, EFLAGS), + (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2, EFLAGS), + 
(!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>; + def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2, EFLAGS), + (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2, EFLAGS), + (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>; + + def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm8_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>; + def : Pat<(store (OpNodeFlag (load addr:$dst), i16relocImmSExt8_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>; + def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm16_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>; + def : Pat<(store (OpNodeFlag (load addr:$dst), i32relocImmSExt8_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>; + def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm32_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>; + def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt8_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>; + def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt32_su:$src, EFLAGS), addr:$dst), + (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>; +} + +multiclass ArithBinOp_F_relocImm_Pats<SDNode OpNodeFlag> { + def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2), + (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>; + def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2), + (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>; + def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2), + (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>; + def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2), + (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>; + + def : Pat<(OpNodeFlag (loadi8 addr:$src1), relocImm8_su:$src2), + (!cast<Instruction>(NAME#"8mi") addr:$src1, relocImm8_su:$src2)>; + def : Pat<(OpNodeFlag (loadi16 addr:$src1), i16relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"16mi8") addr:$src1, i16relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag (loadi16 addr:$src1), relocImm16_su:$src2), + (!cast<Instruction>(NAME#"16mi") addr:$src1, relocImm16_su:$src2)>; + def : Pat<(OpNodeFlag (loadi32 addr:$src1), i32relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"32mi8") addr:$src1, i32relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag (loadi32 addr:$src1), relocImm32_su:$src2), + (!cast<Instruction>(NAME#"32mi") addr:$src1, relocImm32_su:$src2)>; + def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt8_su:$src2), + (!cast<Instruction>(NAME#"64mi8") addr:$src1, i64relocImmSExt8_su:$src2)>; + def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt32_su:$src2), + (!cast<Instruction>(NAME#"64mi32") addr:$src1, 
i64relocImmSExt32_su:$src2)>; +} + +defm AND : ArithBinOp_RF_relocImm_Pats<X86and_flag, and>; +defm OR : ArithBinOp_RF_relocImm_Pats<X86or_flag, or>; +defm XOR : ArithBinOp_RF_relocImm_Pats<X86xor_flag, xor>; +defm ADD : ArithBinOp_RF_relocImm_Pats<X86add_flag, add>; +defm SUB : ArithBinOp_RF_relocImm_Pats<X86sub_flag, sub>; + +defm ADC : ArithBinOp_RFF_relocImm_Pats<X86adc_flag>; +defm SBB : ArithBinOp_RFF_relocImm_Pats<X86sbb_flag>; + +defm CMP : ArithBinOp_F_relocImm_Pats<X86cmp>; + +// ADC is commutable, but we can't indicate that to tablegen. So manually +// reverse the operands. +def : Pat<(X86adc_flag GR8:$src1, relocImm8_su:$src2, EFLAGS), + (ADC8ri relocImm8_su:$src2, GR8:$src1)>; +def : Pat<(X86adc_flag i16relocImmSExt8_su:$src2, GR16:$src1, EFLAGS), + (ADC16ri8 GR16:$src1, i16relocImmSExt8_su:$src2)>; +def : Pat<(X86adc_flag relocImm16_su:$src2, GR16:$src1, EFLAGS), + (ADC16ri GR16:$src1, relocImm16_su:$src2)>; +def : Pat<(X86adc_flag i32relocImmSExt8_su:$src2, GR32:$src1, EFLAGS), + (ADC32ri8 GR32:$src1, i32relocImmSExt8_su:$src2)>; +def : Pat<(X86adc_flag relocImm32_su:$src2, GR32:$src1, EFLAGS), + (ADC32ri GR32:$src1, relocImm32_su:$src2)>; +def : Pat<(X86adc_flag i64relocImmSExt8_su:$src2, GR64:$src1, EFLAGS), + (ADC64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>; +def : Pat<(X86adc_flag i64relocImmSExt32_su:$src2, GR64:$src1, EFLAGS), + (ADC64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>; + +def : Pat<(store (X86adc_flag relocImm8_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC8mi addr:$dst, relocImm8_su:$src)>; +def : Pat<(store (X86adc_flag i16relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC16mi8 addr:$dst, i16relocImmSExt8_su:$src)>; +def : Pat<(store (X86adc_flag relocImm16_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC16mi addr:$dst, relocImm16_su:$src)>; +def : Pat<(store (X86adc_flag i32relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC32mi8 addr:$dst, i32relocImmSExt8_su:$src)>; +def : Pat<(store (X86adc_flag relocImm32_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC32mi addr:$dst, relocImm32_su:$src)>; +def : Pat<(store (X86adc_flag i64relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC64mi8 addr:$dst, i64relocImmSExt8_su:$src)>; +def : Pat<(store (X86adc_flag i64relocImmSExt32_su:$src, (load addr:$dst), EFLAGS), addr:$dst), + (ADC64mi32 addr:$dst, i64relocImmSExt32_su:$src)>; + //===----------------------------------------------------------------------===// // Semantically, test instructions are similar like AND, except they don't // generate a result. From an encoding perspective, they are very different: @@ -1247,7 +1387,6 @@ let isCompare = 1 in { def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; - let Predicates = [In64BitMode] in def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; @@ -1267,6 +1406,25 @@ let isCompare = 1 in { "{$src, %rax|rax, $src}">; } // isCompare +// Patterns to match a relocImm into the immediate field. 
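The multiclass-generated patterns above rely on TableGen's '#' string paste together with NAME, which inside a multiclass is the name supplied on the defm; !cast<Instruction>(NAME#"32ri") therefore resolves to ADD32ri under 'defm ADD : ...', and so on. A minimal, self-contained illustration with hypothetical records:

// Illustration only (hypothetical records): paste NAME with a suffix and look
// the resulting record up with !cast.
class InstrStub {}
def DEMO32ri : InstrStub;

class Holder<InstrStub i> { InstrStub Inst = i; }

multiclass PickByName {
  def _holder : Holder<!cast<InstrStub>(NAME#"32ri")>;
}
// Produces DEMO_holder, whose Inst field is the DEMO32ri record above.
defm DEMO : PickByName;

The stand-alone TEST patterns that follow name the instructions directly, since they are not wrapped in a multiclass.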
+def : Pat<(X86testpat GR8:$src1, relocImm8_su:$src2), + (TEST8ri GR8:$src1, relocImm8_su:$src2)>; +def : Pat<(X86testpat GR16:$src1, relocImm16_su:$src2), + (TEST16ri GR16:$src1, relocImm16_su:$src2)>; +def : Pat<(X86testpat GR32:$src1, relocImm32_su:$src2), + (TEST32ri GR32:$src1, relocImm32_su:$src2)>; +def : Pat<(X86testpat GR64:$src1, i64relocImmSExt32_su:$src2), + (TEST64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>; + +def : Pat<(X86testpat (loadi8 addr:$src1), relocImm8_su:$src2), + (TEST8mi addr:$src1, relocImm8_su:$src2)>; +def : Pat<(X86testpat (loadi16 addr:$src1), relocImm16_su:$src2), + (TEST16mi addr:$src1, relocImm16_su:$src2)>; +def : Pat<(X86testpat (loadi32 addr:$src1), relocImm32_su:$src2), + (TEST32mi addr:$src1, relocImm32_su:$src2)>; +def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2), + (TEST64mi32 addr:$src1, i64relocImmSExt32_su:$src2)>; + //===----------------------------------------------------------------------===// // ANDN Instruction // @@ -1306,7 +1464,6 @@ let Predicates = [HasBMI], AddedComplexity = -6 in { multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop, X86FoldableSchedWrite sched> { let hasSideEffects = 0 in { - let isCommutable = 1 in def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>; @@ -1314,7 +1471,17 @@ let hasSideEffects = 0 in { let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>; + + // Pseudo instructions to be used when the low result isn't used. The + // instruction is defined to keep the high if both destinations are the same. + def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src), + []>, Sched<[sched.Folded]>; } } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h index aa45e9b191c1..07079ef87fd4 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h @@ -207,7 +207,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { Flags |= MachineMemOperand::MOStore; MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); return addOffset(MIB.addFrameIndex(FI), Offset) .addMemOperand(MMO); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td index 1fdac104cb73..4df93fb2ed60 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td @@ -111,8 +111,30 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), [(set GR64:$dst, (X86SegAlloca GR64:$size))]>, Requires<[In64BitMode]>; + +// To protect against stack clash, dynamic allocation should perform a memory +// probe at each page. 
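The comment above introduces stack-clash protection: when a function allocates a dynamic amount of stack, the backend has to touch ("probe") the stack at least once per guard page so an attacker-controlled allocation cannot jump over the OS guard page. The PROBED_ALLOCA_32/64 and STACKALLOC_W_PROBING pseudos defined next carry that requirement through instruction selection. As a rough standalone illustration of the idea only (the 4 KiB page size and the helper below are assumptions for the example, not the backend's actual expansion of these pseudos):

// Conceptual sketch: compute where the per-page probes would land for a
// dynamic allocation, assuming a 4 KiB guard page. The real lowering emits
// stack-pointer adjustments and memory touches; this only models the offsets.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<uint64_t> probeOffsets(uint64_t AllocSize, uint64_t PageSize = 4096) {
  std::vector<uint64_t> Offsets;
  // Touch the stack once per page so every guard page is hit in order.
  for (uint64_t Off = PageSize; Off < AllocSize; Off += PageSize)
    Offsets.push_back(Off);
  Offsets.push_back(AllocSize); // final touch at the end of the allocation
  return Offsets;
}

int main() {
  for (uint64_t Off : probeOffsets(10000))
    std::printf("probe at sp - %llu\n", (unsigned long long)Off);
}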
+ +let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in +def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), + "# variable sized alloca with probing", + [(set GR32:$dst, + (X86ProbedAlloca GR32:$size))]>, + Requires<[NotLP64]>; + +let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in +def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), + "# variable sized alloca with probing", + [(set GR64:$dst, + (X86ProbedAlloca GR64:$size))]>, + Requires<[In64BitMode]>; } +let hasNoSchedulingInfo = 1 in +def STACKALLOC_W_PROBING : I<0, Pseudo, (outs), (ins i64imm:$stacksize), + "# fixed size alloca with probing", + []>; + // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows // targets. These calls are needed to probe the stack when allocating more than // 4k bytes in one go. Touching the stack at 4K increments is necessary to @@ -177,18 +199,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, [(catchret bb:$dst, bb:$from)]>; } -let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, - usesCustomInserter = 1 in -def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; - -// This instruction is responsible for re-establishing stack pointers after an -// exception has been caught and we are rejoining normal control flow in the -// parent function or funclet. It generally sets ESP and EBP, and optionally -// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us -// elsewhere. -let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in -def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; - let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), @@ -308,69 +318,26 @@ def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. -let isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in -def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "", []>; - -// This 64-bit pseudo-move can be used for both a 64-bit constant that is -// actually the zero-extension of a 32-bit constant and for labels in the -// x86-64 small code model. -def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; - +let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, SchedRW = [WriteMove] in +def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "", + [(set GR64:$dst, i64immZExt32:$src)]>; + +// This 64-bit pseudo-move can also be used for labels in the x86-64 small code +// model. +def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [X86Wrapper]>; def : Pat<(i64 mov64imm32:$src), (MOV32ri64 mov64imm32:$src)>; // Use sbb to materialize carry bit. -let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteADC], + hasSideEffects = 0 in { // FIXME: These are pseudo ops that should be replaced with Pat<> patterns. // However, Pat<> can't replicate the destination reg into the inputs of the // result. 
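The SETB_C* pseudos being trimmed below exist to materialize the carry flag into a register with "sbb reg, reg": subtract-with-borrow of a register from itself yields 0 when CF is clear and all-ones when CF is set, and the removed patterns also canonicalized setb as "(and (sbb reg, reg), 1)". A minimal standalone model of that identity, in plain C++ arithmetic rather than backend code (the helper name is made up for the example):

// Model of x86 SBB: dst = a - b - CF. With a == b this is 0 or all-ones,
// which is how the SETB_C* pseudos materialize the carry bit.
#include <cstdint>

constexpr uint32_t sbb32(uint32_t a, uint32_t b, bool cf) {
  return a - b - (cf ? 1u : 0u);
}

static_assert(sbb32(0x1234, 0x1234, /*cf=*/false) == 0x00000000u, "CF clear -> 0");
static_assert(sbb32(0x1234, 0x1234, /*cf=*/true)  == 0xFFFFFFFFu, "CF set -> all ones");
static_assert((sbb32(7, 7, /*cf=*/true) & 1u) == 1u, "(and (sbb r,r), 1) behaves like setb");

int main() { return 0; }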
-def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", - [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", - [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; -def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; +def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", []>; +def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", []>; } // isCodeGenOnly - -def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C16r)>; -def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C32r)>; -def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C64r)>; - -def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C16r)>; -def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C32r)>; -def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C64r)>; - -// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and -// will be eliminated and that the sbb can be extended up to a wider type. When -// this happens, it is great. However, if we are left with an 8-bit sbb and an -// and, we might as well just match it as a setb. -def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), - (SETCCr (i8 2))>; - -// Patterns to give priority when both inputs are zero so that we don't use -// an immediate for the RHS. -// TODO: Should we use a 32-bit sbb for 8/16 to push the extract_subreg out? -def : Pat<(X86sbb_flag (i8 0), (i8 0), EFLAGS), - (SBB8rr (EXTRACT_SUBREG (MOV32r0), sub_8bit), - (EXTRACT_SUBREG (MOV32r0), sub_8bit))>; -def : Pat<(X86sbb_flag (i16 0), (i16 0), EFLAGS), - (SBB16rr (EXTRACT_SUBREG (MOV32r0), sub_16bit), - (EXTRACT_SUBREG (MOV32r0), sub_16bit))>; -def : Pat<(X86sbb_flag (i32 0), (i32 0), EFLAGS), - (SBB32rr (MOV32r0), (MOV32r0))>; -def : Pat<(X86sbb_flag (i64 0), (i64 0), EFLAGS), - (SBB64rr (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit), - (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit))>; - //===----------------------------------------------------------------------===// // String Pseudo Instructions // @@ -568,10 +535,13 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; - let Predicates = [NoAVX512] in { + let Predicates = [HasMMX] in + defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>; + + let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + let Predicates = [HasSSE2,NoAVX512] in defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; - } let Predicates = [HasAVX512] in { defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>; defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>; @@ -585,6 +555,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { defm _VR256X : CMOVrr_PSEUDO<VR256X, v4i64>; } defm _VR512 : CMOVrr_PSEUDO<VR512, v8i64>; + defm _VK1 : CMOVrr_PSEUDO<VK1, v1i1>; defm _VK2 : CMOVrr_PSEUDO<VK2, v2i1>; defm _VK4 : CMOVrr_PSEUDO<VK4, v4i1>; defm _VK8 : CMOVrr_PSEUDO<VK8, v8i1>; @@ -880,7 +851,7 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>; // it. In other words, the register will not fix the clobbering of // RBX that will happen when setting the arguments for the instrucion. // -// Unlike the actual related instuction, we mark that this one +// Unlike the actual related instruction, we mark that this one // defines EBX (instead of using EBX). 
// The rationale is that we will define RBX during the expansion of // the pseudo. The argument feeding EBX is ebx_input. @@ -1815,21 +1786,24 @@ multiclass MaskedRotateAmountPats<SDNode frag, string name> { defm : MaskedRotateAmountPats<rotl, "ROL">; defm : MaskedRotateAmountPats<rotr, "ROR">; -// Double shift amount is implicitly masked. -multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> { - // (shift x (and y, 31)) ==> (shift x, y) - def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)), - (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>; - def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)), - (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>; - - // (shift x (and y, 63)) ==> (shift x, y) - def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)), - (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>; -} - -defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">; -defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">; +// Double "funnel" shift amount is implicitly masked. +// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32) +def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)), + (SHLD16rrCL GR16:$src1, GR16:$src2)>; +def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)), + (SHRD16rrCL GR16:$src1, GR16:$src2)>; + +// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) +def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)), + (SHLD32rrCL GR32:$src1, GR32:$src2)>; +def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)), + (SHRD32rrCL GR32:$src1, GR32:$src2)>; + +// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y) +def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)), + (SHLD64rrCL GR64:$src1, GR64:$src2)>; +def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)), + (SHRD64rrCL GR64:$src1, GR64:$src2)>; let Predicates = [HasBMI2] in { let AddedComplexity = 1 in { @@ -1919,15 +1893,6 @@ defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>; defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>; defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>; - -// (anyext (setcc_carry)) -> (setcc_carry) -def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C16r)>; -def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C32r)>; -def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), - (SETB_C32r)>; - //===----------------------------------------------------------------------===// // EFLAGS-defining Patterns //===----------------------------------------------------------------------===// @@ -1999,10 +1964,6 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; -// sub reg, relocImm -def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2), - (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>; - // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td index 1842dc19ec2e..4f7867744017 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td @@ -193,14 +193,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { "ljmp{l}\t$seg, $off", []>, OpSize32, Sched<[WriteJump]>; } - def FARJMP64 : RI<0xFF, MRM5m, 
(outs), (ins opaquemem:$dst), - "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>; - - let AsmVariantName = "att" in - def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst), - "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>; - def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst), - "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>; + let mayLoad = 1 in { + def FARJMP64m : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst), + "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>; + + let AsmVariantName = "att" in + def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst), + "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>; + def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst), + "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>; + } } // Loop instructions @@ -275,10 +277,12 @@ let isCall = 1 in OpSize32, Sched<[WriteJump]>; } - def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst), - "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>; - def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst), - "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>; + let mayLoad = 1 in { + def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst), + "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>; + def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst), + "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>; + } } @@ -351,7 +355,8 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK; } - def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst), + let mayLoad = 1 in + def FARCALL64m : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst), "lcall{q}\t{*}$dst", []>; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td index 9e43a532a3f8..4dbd6bb8cd7e 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td @@ -126,7 +126,7 @@ let ExeDomain = SSEPackedSingle in { loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32, SchedWriteFMA>; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", - loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32, + loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32, SchedWriteFMA>; defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS", loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32, @@ -141,7 +141,7 @@ let ExeDomain = SSEPackedDouble in { loadv2f64, loadv4f64, X86any_Fmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD", - loadv2f64, loadv4f64, X86Fmsub, v2f64, + loadv2f64, loadv4f64, X86any_Fmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD", loadv2f64, loadv4f64, X86Fmaddsub, @@ -154,19 +154,19 @@ let ExeDomain = SSEPackedDouble in { // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32, - loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>; + loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>; defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32, - loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>; + loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>; } let ExeDomain = SSEPackedDouble in { defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64, - loadv4f64, X86Fnmadd, 
v2f64, v4f64, SchedWriteFMA>, VEX_W; + loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64, - loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W; + loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W; } // All source register operands of FMA opcodes defined in fma3s_rm multiclass -// can be commuted. In many cases such commute transformation requres an opcode +// can be commuted. In many cases such commute transformation requires an opcode // adjustment, for example, commuting the operands 1 and 2 in FMA*132 form // would require an opcode change to FMA*231: // FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; @@ -283,7 +283,7 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, []>, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } -// The FMA 213 form is created for lowering of scalar FMA intrinscis +// The FMA 213 form is created for lowering of scalar FMA intrinsics // to machine instructions. // The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands // of FMA 213 form. @@ -321,12 +321,12 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd, SchedWriteFMA.Scl>, VEX_LIG; -defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub, +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub, SchedWriteFMA.Scl>, VEX_LIG; -defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd, +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd, SchedWriteFMA.Scl>, VEX_LIG; -defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub, +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub, SchedWriteFMA.Scl>, VEX_LIG; multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, @@ -373,14 +373,14 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, } defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; -defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; -defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; -defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; -defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; -defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; -defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; //===----------------------------------------------------------------------===// // FMA4 - AMD 4 operand Fused Multiply-Add instructions @@ -542,26 +542,26 @@ let ExeDomain = SSEPackedSingle in { SchedWriteFMA.Scl>, fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, 
SchedWriteFMA.Scl>; - defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32, + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32, SchedWriteFMA.Scl>, + X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, SchedWriteFMA.Scl>; defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32, SchedWriteFMA.Scl>, + X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>, fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, SchedWriteFMA.Scl>; // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; - defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; - defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; - defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, loadv4f32, loadv8f32, SchedWriteFMA>; @@ -575,26 +575,26 @@ let ExeDomain = SSEPackedDouble in { SchedWriteFMA.Scl>, fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; - defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64, + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64, SchedWriteFMA.Scl>, + X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, SchedWriteFMA.Scl>; defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64, SchedWriteFMA.Scl>, + X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>, fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, SchedWriteFMA.Scl>; // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; - defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; - defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; - defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, loadv2f64, loadv4f64, SchedWriteFMA>; @@ -630,11 +630,11 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name, } defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>; -defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>; -defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>; -defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, 
loadf32>; +defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>; defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>; -defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>; -defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>; -defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp index 25bbdddb7a21..6d803e931b68 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -116,11 +116,8 @@ static void verifyTables() { #ifndef NDEBUG static std::atomic<bool> TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { - assert(std::is_sorted(std::begin(Groups), std::end(Groups)) && - std::is_sorted(std::begin(RoundGroups), std::end(RoundGroups)) && - std::is_sorted(std::begin(BroadcastGroups), - std::end(BroadcastGroups)) && - "FMA3 tables not sorted!"); + assert(llvm::is_sorted(Groups) && llvm::is_sorted(RoundGroups) && + llvm::is_sorted(BroadcastGroups) && "FMA3 tables not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } #endif diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h index 7fa6f5917862..ce0a7cc7f82e 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h @@ -14,11 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H #define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H -#include "X86.h" -#include "llvm/ADT/DenseMap.h" -#include <cassert> #include <cstdint> -#include <set> namespace llvm { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td index 1830262205c6..67dcb8d00ea5 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td @@ -22,24 +22,17 @@ def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; -def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, - [SDNPHasChain, SDNPOptInGlue, SDNPMayStore, - SDNPMemOperand]>; + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, - [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, - SDNPMemOperand]>; def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist, - [SDNPHasChain, SDNPOptInGlue, SDNPMayStore, - SDNPMemOperand]>; -def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; + 
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, @@ -79,8 +72,9 @@ def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; }]>; -def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{ - return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; +def X86fist32 : PatFrag<(ops node:$val, node:$ptr), + (X86fist node:$val, node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; }]>; def X86fist64 : PatFrag<(ops node:$val, node:$ptr), @@ -292,7 +286,7 @@ defm MUL : FPBinary_rr<any_fmul>; defm DIV : FPBinary_rr<any_fdiv>; } -// Sets the scheduling resources for the actual NAME#_F<size>m defintions. +// Sets the scheduling resources for the actual NAME#_F<size>m definitions. let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary<any_fadd, MRM0m, "add">; defm SUB : FPBinary<any_fsub, MRM4m, "sub">; @@ -381,7 +375,8 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. -let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1 in { +let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1, + mayLoad = 1 in { def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; @@ -396,21 +391,22 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; } // SchedRW let SchedRW = [WriteMicrocoded] in { -let Defs = [FPSW, FPCW] in { +let Defs = [FPSW, FPCW], mayLoad = 1 in { def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">; } -let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW] in { +let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in { def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">; } -let Uses = [FPSW] in +let Uses = [FPSW], mayStore = 1 in def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">; +let mayLoad = 1 in def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; -let Uses = [FPCW] ,mayRaiseFPException = 1 in +let Uses = [FPCW] ,mayRaiseFPException = 1, mayStore = 1 in def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">; } // SchedRW @@ -534,14 +530,20 @@ def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, let mayStore = 1, hasSideEffects = 0 in { def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; -def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; -def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, + [(X86fist32 RFP32:$src, addr:$op)]>; +def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, + [(X86fist64 RFP32:$src, addr:$op)]>; def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; -def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; -def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp32m64 : 
FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, + [(X86fist32 RFP64:$src, addr:$op)]>; +def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, + [(X86fist64 RFP64:$src, addr:$op)]>; def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; -def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; -def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, + [(X86fist32 RFP80:$src, addr:$op)]>; +def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, + [(X86fist64 RFP80:$src, addr:$op)]>; } // mayStore } // SchedRW, Uses = [FPCW] @@ -601,6 +603,7 @@ let SchedRW = [WriteMove], Uses = [FPCW] in { def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">; def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">; def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">; +let mayRaiseFPException = 0 in def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">; } @@ -620,13 +623,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, [(set RFP80:$dst, fpimm1)]>; } -let SchedRW = [WriteFLD0], Uses = [FPCW] in +let SchedRW = [WriteFLD0], Uses = [FPCW], mayRaiseFPException = 0 in def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">; -let SchedRW = [WriteFLD1], Uses = [FPCW] in +let SchedRW = [WriteFLD1], Uses = [FPCW], mayRaiseFPException = 0 in def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">; -let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW] in { +let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW], mayRaiseFPException = 0 in { def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>; def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>; def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>; @@ -635,25 +638,19 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>; } // SchedRW // Floating point compares. 
-let SchedRW = [WriteFCom], Uses = [FPCW] in { -def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - [(set FPSW, (trunc (X86any_fcmp RFP32:$lhs, RFP32:$rhs)))]>; -def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - [(set FPSW, (trunc (X86any_fcmp RFP64:$lhs, RFP64:$rhs)))]>; -def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - [(set FPSW, (trunc (X86any_fcmp RFP80:$lhs, RFP80:$rhs)))]>; -def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, - [(set FPSW, (trunc (X86strict_fcmps RFP32:$lhs, RFP32:$rhs)))]>; -def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, - [(set FPSW, (trunc (X86strict_fcmps RFP64:$lhs, RFP64:$rhs)))]>; -def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, - [(set FPSW, (trunc (X86strict_fcmps RFP80:$lhs, RFP80:$rhs)))]>; +let SchedRW = [WriteFCom], Uses = [FPCW], hasSideEffects = 0 in { +def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>; +def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>; +def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>; +def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>; +def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>; +def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>; } // SchedRW } // mayRaiseFPException = 1 let SchedRW = [WriteFCom], mayRaiseFPException = 1 in { // CC = ST(0) cmp ST(i) -let Defs = [EFLAGS, FPCW], Uses = [FPCW] in { +let Defs = [EFLAGS, FPSW], Uses = [FPCW] in { def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, [(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>, Requires<[FPStackf32, HasCMov]>; @@ -698,10 +695,9 @@ def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg), // Floating point flag ops. let SchedRW = [WriteALU] in { -let Defs = [AX, FPSW], Uses = [FPSW] in +let Defs = [AX, FPSW], Uses = [FPSW], hasSideEffects = 0 in def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags - (outs), (ins), "fnstsw\t{%ax|ax}", - [(set AX, (X86fp_stsw FPSW))]>; + (outs), (ins), "fnstsw\t{%ax|ax}", []>; let Defs = [FPSW], Uses = [FPCW] in def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", @@ -754,20 +750,20 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>; let Uses = [FPSW, FPCW] in { def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB, + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, PS, Requires<[HasFXSR]>; def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst), "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>, - TB, Requires<[HasFXSR, In64BitMode]>; + PS, Requires<[HasFXSR, In64BitMode]>; } // Uses = [FPSW, FPCW] let Defs = [FPSW, FPCW] in { def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>, - TB, Requires<[HasFXSR]>; + PS, Requires<[HasFXSR]>; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>, - TB, Requires<[HasFXSR, In64BitMode]>; + PS, Requires<[HasFXSR, In64BitMode]>; } // Defs = [FPSW, FPCW] } // SchedRW @@ -799,13 +795,6 @@ def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>; def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>; def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>; -// Used to conv. i64 to f64 since there isn't a SSE version. 
-def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>; - -// Used to conv. between f80 and i64 for i64 atomic loads. -def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>; -def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>; - // FP extensions map onto simple pseudo-value conversions if they are to/from // the FP stack. def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp index f3b286e0375c..e16382e956c5 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -486,7 +486,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, + { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE }, + { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE }, { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, @@ -494,7 +496,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 }, { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 }, { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, + { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE }, + { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, @@ -627,18 +631,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::UCOMISSrr_Int, X86::UCOMISSrm_Int, TB_NO_REVERSE }, { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, { X86::VAESKEYGENASSIST128rr,X86::VAESKEYGENASSIST128rm,0 }, - { X86::VBROADCASTF32X2Z256r, X86::VBROADCASTF32X2Z256m, TB_NO_REVERSE }, - { X86::VBROADCASTF32X2Zr, X86::VBROADCASTF32X2Zm, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Z128r, X86::VBROADCASTI32X2Z128m, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Z256r, X86::VBROADCASTI32X2Z256m, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Zr, X86::VBROADCASTI32X2Zm, TB_NO_REVERSE }, + { X86::VBROADCASTF32X2Z256rr,X86::VBROADCASTF32X2Z256rm,TB_NO_REVERSE }, + { X86::VBROADCASTF32X2Zrr, X86::VBROADCASTF32X2Zrm, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Z128rr,X86::VBROADCASTI32X2Z128rm,TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Z256rr,X86::VBROADCASTI32X2Z256rm,TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Zrr, X86::VBROADCASTI32X2Zrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rr, X86::VBROADCASTSDZ256rm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrm, TB_NO_REVERSE }, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rr, X86::VBROADCASTSSZ128rm, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rr, X86::VBROADCASTSSZ256rm, TB_NO_REVERSE }, + { 
X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrm, TB_NO_REVERSE }, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, { X86::VCOMISDZrr, X86::VCOMISDZrm, 0 }, { X86::VCOMISDZrr_Int, X86::VCOMISDZrm_Int, TB_NO_REVERSE }, @@ -710,15 +714,23 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rm, 0 }, { X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rm, 0 }, { X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrm, 0 }, + { X86::VCVTSD2SI64Zrr, X86::VCVTSD2SI64Zrm, 0 }, { X86::VCVTSD2SI64Zrr_Int, X86::VCVTSD2SI64Zrm_Int, TB_NO_REVERSE }, + { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE }, + { X86::VCVTSD2SIZrr, X86::VCVTSD2SIZrm, 0 }, { X86::VCVTSD2SIZrr_Int, X86::VCVTSD2SIZrm_Int, TB_NO_REVERSE }, + { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE }, { X86::VCVTSD2USI64Zrr_Int, X86::VCVTSD2USI64Zrm_Int, TB_NO_REVERSE }, { X86::VCVTSD2USIZrr_Int, X86::VCVTSD2USIZrm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SI64Zrr, X86::VCVTSS2SI64Zrm, 0 }, { X86::VCVTSS2SI64Zrr_Int, X86::VCVTSS2SI64Zrm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SIZrr, X86::VCVTSS2SIZrm, 0 }, { X86::VCVTSS2SIZrr_Int, X86::VCVTSS2SIZrm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE }, { X86::VCVTSS2USI64Zrr_Int, X86::VCVTSS2USI64Zrm_Int, TB_NO_REVERSE }, { X86::VCVTSS2USIZrr_Int, X86::VCVTSS2USIZrm_Int, TB_NO_REVERSE }, @@ -906,24 +918,24 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VPABSWZrr, X86::VPABSWZrm, 0 }, { X86::VPABSWrr, X86::VPABSWrm, 0 }, { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTBZ128r, X86::VPBROADCASTBZ128m, TB_NO_REVERSE }, - { X86::VPBROADCASTBZ256r, X86::VPBROADCASTBZ256m, TB_NO_REVERSE }, - { X86::VPBROADCASTBZr, X86::VPBROADCASTBZm, TB_NO_REVERSE }, - { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE }, + { X86::VPBROADCASTBZ128rr, X86::VPBROADCASTBZ128rm, TB_NO_REVERSE }, + { X86::VPBROADCASTBZ256rr, X86::VPBROADCASTBZ256rm, TB_NO_REVERSE }, + { X86::VPBROADCASTBZrr, X86::VPBROADCASTBZrm, TB_NO_REVERSE }, + { X86::VPBROADCASTBrr , X86::VPBROADCASTBrm, TB_NO_REVERSE }, { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDZ128r, X86::VPBROADCASTDZ128m, TB_NO_REVERSE }, - { X86::VPBROADCASTDZ256r, X86::VPBROADCASTDZ256m, TB_NO_REVERSE }, - { X86::VPBROADCASTDZr, X86::VPBROADCASTDZm, TB_NO_REVERSE }, + { X86::VPBROADCASTDZ128rr, X86::VPBROADCASTDZ128rm, TB_NO_REVERSE }, + { X86::VPBROADCASTDZ256rr, X86::VPBROADCASTDZ256rm, TB_NO_REVERSE }, + { X86::VPBROADCASTDZrr, X86::VPBROADCASTDZrm, TB_NO_REVERSE }, { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE }, { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQZ128r, X86::VPBROADCASTQZ128m, TB_NO_REVERSE }, - { X86::VPBROADCASTQZ256r, X86::VPBROADCASTQZ256m, TB_NO_REVERSE }, - { X86::VPBROADCASTQZr, X86::VPBROADCASTQZm, TB_NO_REVERSE }, + { X86::VPBROADCASTQZ128rr, X86::VPBROADCASTQZ128rm, TB_NO_REVERSE }, + { X86::VPBROADCASTQZ256rr, X86::VPBROADCASTQZ256rm, TB_NO_REVERSE }, + { X86::VPBROADCASTQZrr, X86::VPBROADCASTQZrm, TB_NO_REVERSE }, { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE }, { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWZ128r, 
X86::VPBROADCASTWZ128m, TB_NO_REVERSE }, - { X86::VPBROADCASTWZ256r, X86::VPBROADCASTWZ256m, TB_NO_REVERSE }, - { X86::VPBROADCASTWZr, X86::VPBROADCASTWZm, TB_NO_REVERSE }, + { X86::VPBROADCASTWZ128rr, X86::VPBROADCASTWZ128rm, TB_NO_REVERSE }, + { X86::VPBROADCASTWZ256rr, X86::VPBROADCASTWZ256rm, TB_NO_REVERSE }, + { X86::VPBROADCASTWZrr, X86::VPBROADCASTWZrm, TB_NO_REVERSE }, { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE }, { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, { X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 }, @@ -1100,9 +1112,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, - { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 }, - { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 }, - { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 }, + { X86::VPSLLDQZ128ri, X86::VPSLLDQZ128mi, 0 }, + { X86::VPSLLDQZ256ri, X86::VPSLLDQZ256mi, 0 }, + { X86::VPSLLDQZri, X86::VPSLLDQZmi, 0 }, { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 }, { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 }, { X86::VPSLLDZri, X86::VPSLLDZmi, 0 }, @@ -1121,9 +1133,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 }, { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 }, { X86::VPSRAWZri, X86::VPSRAWZmi, 0 }, - { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 }, - { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 }, - { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 }, + { X86::VPSRLDQZ128ri, X86::VPSRLDQZ128mi, 0 }, + { X86::VPSRLDQZ256ri, X86::VPSRLDQZ256mi, 0 }, + { X86::VPSRLDQZri, X86::VPSRLDQZmi, 0 }, { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 }, { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 }, { X86::VPSRLDZri, X86::VPSRLDZmi, 0 }, @@ -1609,16 +1621,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, - { X86::VBROADCASTF32X2Z256rkz, X86::VBROADCASTF32X2Z256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTF32X2Zrkz, X86::VBROADCASTF32X2Zmkz, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Z128rkz, X86::VBROADCASTI32X2Z128mkz, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Z256rkz, X86::VBROADCASTI32X2Z256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Zrkz, X86::VBROADCASTI32X2Zmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, + { X86::VBROADCASTF32X2Z256rrkz, X86::VBROADCASTF32X2Z256rmkz, TB_NO_REVERSE }, + { X86::VBROADCASTF32X2Zrrkz, X86::VBROADCASTF32X2Zrmkz, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Z128rrkz, X86::VBROADCASTI32X2Z128rmkz, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Z256rrkz, X86::VBROADCASTI32X2Z256rmkz, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Zrrkz, X86::VBROADCASTI32X2Zrmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rrkz, X86::VBROADCASTSDZ256rmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrrkz, X86::VBROADCASTSDZrmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rrkz, X86::VBROADCASTSSZ128rmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rrkz, X86::VBROADCASTSSZ256rmkz, TB_NO_REVERSE }, + { X86::VBROADCASTSSZrrkz, X86::VBROADCASTSSZrmkz, TB_NO_REVERSE }, { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, { X86::VCMPPDZ128rri, 
X86::VCMPPDZ128rmi, 0 }, { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, @@ -2153,18 +2165,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, - { X86::VPBROADCASTBZ128rkz, X86::VPBROADCASTBZ128mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTBZ256rkz, X86::VPBROADCASTBZ256mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTBZrkz, X86::VPBROADCASTBZmkz, TB_NO_REVERSE }, - { X86::VPBROADCASTDZ128rkz, X86::VPBROADCASTDZ128mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTDZ256rkz, X86::VPBROADCASTDZ256mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTDZrkz, X86::VPBROADCASTDZmkz, TB_NO_REVERSE }, - { X86::VPBROADCASTQZ128rkz, X86::VPBROADCASTQZ128mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTQZ256rkz, X86::VPBROADCASTQZ256mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTQZrkz, X86::VPBROADCASTQZmkz, TB_NO_REVERSE }, - { X86::VPBROADCASTWZ128rkz, X86::VPBROADCASTWZ128mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTWZ256rkz, X86::VPBROADCASTWZ256mkz, TB_NO_REVERSE }, - { X86::VPBROADCASTWZrkz, X86::VPBROADCASTWZmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTBZ128rrkz, X86::VPBROADCASTBZ128rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTBZ256rrkz, X86::VPBROADCASTBZ256rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTBZrrkz, X86::VPBROADCASTBZrmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTDZ128rrkz, X86::VPBROADCASTDZ128rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTDZ256rrkz, X86::VPBROADCASTDZ256rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTDZrrkz, X86::VPBROADCASTDZrmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTQZ128rrkz, X86::VPBROADCASTQZ128rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTQZ256rrkz, X86::VPBROADCASTQZ256rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTQZrrkz, X86::VPBROADCASTQZrmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTWZ128rrkz, X86::VPBROADCASTWZ128rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTWZ256rrkz, X86::VPBROADCASTWZ256rmkz, TB_NO_REVERSE }, + { X86::VPBROADCASTWZrrkz, X86::VPBROADCASTWZrmkz, TB_NO_REVERSE }, { X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0 }, { X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0 }, { X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0 }, @@ -3010,16 +3022,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0 }, { X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0 }, { X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0 }, - { X86::VBROADCASTF32X2Z256rk, X86::VBROADCASTF32X2Z256mk, TB_NO_REVERSE }, - { X86::VBROADCASTF32X2Zrk, X86::VBROADCASTF32X2Zmk, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Z128rk, X86::VBROADCASTI32X2Z128mk, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Z256rk, X86::VBROADCASTI32X2Z256mk, TB_NO_REVERSE }, - { X86::VBROADCASTI32X2Zrk, X86::VBROADCASTI32X2Zmk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, + { X86::VBROADCASTF32X2Z256rrk, X86::VBROADCASTF32X2Z256rmk, TB_NO_REVERSE }, + { X86::VBROADCASTF32X2Zrrk, X86::VBROADCASTF32X2Zrmk, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Z128rrk, X86::VBROADCASTI32X2Z128rmk, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Z256rrk, X86::VBROADCASTI32X2Z256rmk, TB_NO_REVERSE }, + { X86::VBROADCASTI32X2Zrrk, X86::VBROADCASTI32X2Zrmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256rrk, 
X86::VBROADCASTSDZ256rmk, TB_NO_REVERSE }, + { X86::VBROADCASTSDZrrk, X86::VBROADCASTSDZrmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128rrk, X86::VBROADCASTSSZ128rmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256rrk, X86::VBROADCASTSSZ256rmk, TB_NO_REVERSE }, + { X86::VBROADCASTSSZrrk, X86::VBROADCASTSSZrmk, TB_NO_REVERSE }, { X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 }, { X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 }, { X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 }, @@ -3662,18 +3674,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { { X86::VPBLENDMWZ128rrk, X86::VPBLENDMWZ128rmk, 0 }, { X86::VPBLENDMWZ256rrk, X86::VPBLENDMWZ256rmk, 0 }, { X86::VPBLENDMWZrrk, X86::VPBLENDMWZrmk, 0 }, - { X86::VPBROADCASTBZ128rk, X86::VPBROADCASTBZ128mk, TB_NO_REVERSE }, - { X86::VPBROADCASTBZ256rk, X86::VPBROADCASTBZ256mk, TB_NO_REVERSE }, - { X86::VPBROADCASTBZrk, X86::VPBROADCASTBZmk, TB_NO_REVERSE }, - { X86::VPBROADCASTDZ128rk, X86::VPBROADCASTDZ128mk, TB_NO_REVERSE }, - { X86::VPBROADCASTDZ256rk, X86::VPBROADCASTDZ256mk, TB_NO_REVERSE }, - { X86::VPBROADCASTDZrk, X86::VPBROADCASTDZmk, TB_NO_REVERSE }, - { X86::VPBROADCASTQZ128rk, X86::VPBROADCASTQZ128mk, TB_NO_REVERSE }, - { X86::VPBROADCASTQZ256rk, X86::VPBROADCASTQZ256mk, TB_NO_REVERSE }, - { X86::VPBROADCASTQZrk, X86::VPBROADCASTQZmk, TB_NO_REVERSE }, - { X86::VPBROADCASTWZ128rk, X86::VPBROADCASTWZ128mk, TB_NO_REVERSE }, - { X86::VPBROADCASTWZ256rk, X86::VPBROADCASTWZ256mk, TB_NO_REVERSE }, - { X86::VPBROADCASTWZrk, X86::VPBROADCASTWZmk, TB_NO_REVERSE }, + { X86::VPBROADCASTBZ128rrk, X86::VPBROADCASTBZ128rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTBZ256rrk, X86::VPBROADCASTBZ256rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTBZrrk, X86::VPBROADCASTBZrmk, TB_NO_REVERSE }, + { X86::VPBROADCASTDZ128rrk, X86::VPBROADCASTDZ128rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTDZ256rrk, X86::VPBROADCASTDZ256rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTDZrrk, X86::VPBROADCASTDZrmk, TB_NO_REVERSE }, + { X86::VPBROADCASTQZ128rrk, X86::VPBROADCASTQZ128rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTQZ256rrk, X86::VPBROADCASTQZ256rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTQZrrk, X86::VPBROADCASTQZrmk, TB_NO_REVERSE }, + { X86::VPBROADCASTWZ128rrk, X86::VPBROADCASTWZ128rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTWZ256rrk, X86::VPBROADCASTWZ256rmk, TB_NO_REVERSE }, + { X86::VPBROADCASTWZrrk, X86::VPBROADCASTWZrmk, TB_NO_REVERSE }, { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 }, { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 }, { X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 }, @@ -5509,6 +5521,12 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = { { X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS }, { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS }, { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS }, + { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmbi, TB_BCAST_D }, + { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmbi, TB_BCAST_D }, + { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmbi, TB_BCAST_D }, + { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmbi, TB_BCAST_Q }, + { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmbi, TB_BCAST_Q }, + { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmbi, TB_BCAST_Q }, }; static const X86MemoryFoldTableEntry * @@ -5517,53 +5535,45 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { // Make sure the tables are sorted. 
static std::atomic<bool> FoldTablesChecked(false); if (!FoldTablesChecked.load(std::memory_order_relaxed)) { - assert(std::is_sorted(std::begin(MemoryFoldTable2Addr), - std::end(MemoryFoldTable2Addr)) && + assert(llvm::is_sorted(MemoryFoldTable2Addr) && std::adjacent_find(std::begin(MemoryFoldTable2Addr), std::end(MemoryFoldTable2Addr)) == - std::end(MemoryFoldTable2Addr) && + std::end(MemoryFoldTable2Addr) && "MemoryFoldTable2Addr is not sorted and unique!"); - assert(std::is_sorted(std::begin(MemoryFoldTable0), - std::end(MemoryFoldTable0)) && + assert(llvm::is_sorted(MemoryFoldTable0) && std::adjacent_find(std::begin(MemoryFoldTable0), std::end(MemoryFoldTable0)) == - std::end(MemoryFoldTable0) && + std::end(MemoryFoldTable0) && "MemoryFoldTable0 is not sorted and unique!"); - assert(std::is_sorted(std::begin(MemoryFoldTable1), - std::end(MemoryFoldTable1)) && + assert(llvm::is_sorted(MemoryFoldTable1) && std::adjacent_find(std::begin(MemoryFoldTable1), std::end(MemoryFoldTable1)) == - std::end(MemoryFoldTable1) && + std::end(MemoryFoldTable1) && "MemoryFoldTable1 is not sorted and unique!"); - assert(std::is_sorted(std::begin(MemoryFoldTable2), - std::end(MemoryFoldTable2)) && + assert(llvm::is_sorted(MemoryFoldTable2) && std::adjacent_find(std::begin(MemoryFoldTable2), std::end(MemoryFoldTable2)) == - std::end(MemoryFoldTable2) && + std::end(MemoryFoldTable2) && "MemoryFoldTable2 is not sorted and unique!"); - assert(std::is_sorted(std::begin(MemoryFoldTable3), - std::end(MemoryFoldTable3)) && + assert(llvm::is_sorted(MemoryFoldTable3) && std::adjacent_find(std::begin(MemoryFoldTable3), std::end(MemoryFoldTable3)) == - std::end(MemoryFoldTable3) && + std::end(MemoryFoldTable3) && "MemoryFoldTable3 is not sorted and unique!"); - assert(std::is_sorted(std::begin(MemoryFoldTable4), - std::end(MemoryFoldTable4)) && + assert(llvm::is_sorted(MemoryFoldTable4) && std::adjacent_find(std::begin(MemoryFoldTable4), std::end(MemoryFoldTable4)) == - std::end(MemoryFoldTable4) && + std::end(MemoryFoldTable4) && "MemoryFoldTable4 is not sorted and unique!"); - assert(std::is_sorted(std::begin(BroadcastFoldTable2), - std::end(BroadcastFoldTable2)) && + assert(llvm::is_sorted(BroadcastFoldTable2) && std::adjacent_find(std::begin(BroadcastFoldTable2), std::end(BroadcastFoldTable2)) == - std::end(BroadcastFoldTable2) && + std::end(BroadcastFoldTable2) && "BroadcastFoldTable2 is not sorted and unique!"); - assert(std::is_sorted(std::begin(BroadcastFoldTable3), - std::end(BroadcastFoldTable3)) && + assert(llvm::is_sorted(BroadcastFoldTable3) && std::adjacent_find(std::begin(BroadcastFoldTable3), std::end(BroadcastFoldTable3)) == - std::end(BroadcastFoldTable3) && + std::end(BroadcastFoldTable3) && "BroadcastFoldTable3 is not sorted and unique!"); FoldTablesChecked.store(true, std::memory_order_relaxed); } @@ -5639,7 +5649,7 @@ struct X86MemUnfoldTable { addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) - // Index 2, folded broadcast + // Index 3, folded broadcast addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); // Sort the memory->reg unfold table. 
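The asserts rewritten above enforce the invariant that every fold table is sorted by the register-form opcode and contains no duplicate keys, since lookups into these tables are binary searches. A self-contained sketch of that check-and-lookup pattern, using made-up opcode values rather than real X86:: enumerators and plain std:: calls in place of the LLVM helpers:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>

struct FoldEntry {
  uint16_t RegOp;   // key: register-form opcode (values below are invented)
  uint16_t MemOp;   // folded memory-form opcode
  uint16_t Flags;
  bool operator<(const FoldEntry &RHS) const { return RegOp < RHS.RegOp; }
  bool operator==(const FoldEntry &RHS) const { return RegOp == RHS.RegOp; }
};

// Must stay sorted by RegOp and free of duplicate keys.
static const FoldEntry Table[] = {{10, 110, 0}, {20, 120, 0}, {30, 130, 0}};

const FoldEntry *lookupFold(uint16_t RegOp) {
  // Same shape as the asserts in the diff: sorted and unique, so binary
  // search over the table is valid.
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         std::adjacent_find(std::begin(Table), std::end(Table)) ==
             std::end(Table) &&
         "fold table is not sorted and unique!");
  const FoldEntry Probe{RegOp, 0, 0};
  auto I = std::lower_bound(std::begin(Table), std::end(Table), Probe);
  if (I != std::end(Table) && I->RegOp == RegOp)
    return I;
  return nullptr;
}

int main() { return lookupFold(20) ? 0 : 1; }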
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h index 7dc236a0d7e4..b7aca27ab2bb 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H #define LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H -#include "llvm/Support/DataTypes.h" +#include <cstdint> namespace llvm { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td index 2f797fcfb8de..d7752e656b55 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td @@ -27,26 +27,33 @@ def RawFrmDstSrc : Format<6>; def RawFrmImm8 : Format<7>; def RawFrmImm16 : Format<8>; def AddCCFrm : Format<9>; -def MRMDestMem : Format<32>; -def MRMSrcMem : Format<33>; -def MRMSrcMem4VOp3 : Format<34>; -def MRMSrcMemOp4 : Format<35>; -def MRMSrcMemCC : Format<36>; -def MRMXmCC: Format<38>; -def MRMXm : Format<39>; -def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>; -def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>; -def MRM6m : Format<46>; def MRM7m : Format<47>; -def MRMDestReg : Format<48>; -def MRMSrcReg : Format<49>; -def MRMSrcReg4VOp3 : Format<50>; -def MRMSrcRegOp4 : Format<51>; -def MRMSrcRegCC : Format<52>; -def MRMXrCC: Format<54>; -def MRMXr : Format<55>; -def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>; -def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>; -def MRM6r : Format<62>; def MRM7r : Format<63>; +def PrefixByte : Format<10>; +def MRMr0 : Format<21>; +def MRMSrcMemFSIB : Format<22>; +def MRMDestMemFSIB : Format<23>; +def MRMDestMem : Format<24>; +def MRMSrcMem : Format<25>; +def MRMSrcMem4VOp3 : Format<26>; +def MRMSrcMemOp4 : Format<27>; +def MRMSrcMemCC : Format<28>; +def MRMXmCC: Format<30>; +def MRMXm : Format<31>; +def MRM0m : Format<32>; def MRM1m : Format<33>; def MRM2m : Format<34>; +def MRM3m : Format<35>; def MRM4m : Format<36>; def MRM5m : Format<37>; +def MRM6m : Format<38>; def MRM7m : Format<39>; +def MRMDestReg : Format<40>; +def MRMSrcReg : Format<41>; +def MRMSrcReg4VOp3 : Format<42>; +def MRMSrcRegOp4 : Format<43>; +def MRMSrcRegCC : Format<44>; +def MRMXrCC: Format<46>; +def MRMXr : Format<47>; +def MRM0r : Format<48>; def MRM1r : Format<49>; def MRM2r : Format<50>; +def MRM3r : Format<51>; def MRM4r : Format<52>; def MRM5r : Format<53>; +def MRM6r : Format<54>; def MRM7r : Format<55>; +def MRM0X : Format<56>; def MRM1X : Format<57>; def MRM2X : Format<58>; +def MRM3X : Format<59>; def MRM4X : Format<60>; def MRM5X : Format<61>; +def MRM6X : Format<62>; def MRM7X : Format<63>; def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>; def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>; def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 3250123e5aa6..f3f7d17d9b3c 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -59,9 +59,13 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def 
X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; -def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; -def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; -def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>; + +def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; + def X86pshufb : SDNode<"X86ISD::PSHUFB", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -535,8 +539,20 @@ def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3), [(X86strict_Fmadd node:$src1, node:$src2, node:$src3), (X86Fmadd node:$src1, node:$src2, node:$src3)]>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3), + (X86Fnmadd node:$src1, node:$src2, node:$src3)]>; def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fmsub node:$src1, node:$src2, node:$src3), + (X86Fmsub node:$src1, node:$src2, node:$src3)]>; def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>; +def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3), + [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3), + (X86Fnmsub node:$src1, node:$src2, node:$src3)]>; def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>; def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>; @@ -709,19 +725,27 @@ def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>; def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>; def X86mcvttp2ui : SDNode<"X86ISD::MCVTTP2UI", SDTMFloatToInt>; +def SDTcvtph2ps : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>]>; +def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTcvtph2ps>; +def X86strict_cvtph2ps : SDNode<"X86ISD::STRICT_CVTPH2PS", SDTcvtph2ps, + [SDNPHasChain]>; +def X86any_cvtph2ps : PatFrags<(ops node:$src), + [(X86strict_cvtph2ps node:$src), + (X86cvtph2ps node:$src)]>; + +def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", SDTcvtph2ps>; + +def SDTcvtps2ph : SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisVT<2, i32>]>; +def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTcvtps2ph>; +def X86strict_cvtps2ph : SDNode<"X86ISD::STRICT_CVTPS2PH", SDTcvtps2ph, + [SDNPHasChain]>; +def X86any_cvtps2ph : PatFrags<(ops node:$src1, node:$src2), + [(X86strict_cvtps2ph node:$src1, node:$src2), + (X86cvtps2ph node:$src1, node:$src2)]>; -def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, i16>]> >; - -def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, - SDTCVecEltisVT<1, i16>]> >; - -def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, - SDTCVecEltisVT<1, f32>, - SDTCisVT<2, 
i32>]> >; def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH", SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>, SDTCVecEltisVT<1, f32>, @@ -741,7 +765,9 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", // cvt fp to bfloat16 def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisSameSizeAs<0,1>, SDTCisSameAs<1,2>]>>; def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, @@ -768,23 +794,6 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store ]>; //===----------------------------------------------------------------------===// -// SSE Complex Patterns -//===----------------------------------------------------------------------===// - -// These are 'extloads' from a scalar to the low element of a vector, zeroing -// the top elements. These are used for the SSE 'ss' and 'sd' instruction -// forms. -def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, - SDNPWantRoot, SDNPWantParent]>; -def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, - SDNPWantRoot, SDNPWantParent]>; - -def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; -def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; - -//===----------------------------------------------------------------------===// // SSE pattern fragments //===----------------------------------------------------------------------===// @@ -895,89 +904,6 @@ def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>; def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>; def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>; -def X86masked_gather : SDNode<"X86ISD::MGATHER", - SDTypeProfile<2, 3, [SDTCisVec<0>, - SDTCisVec<1>, SDTCisInt<1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<1, 3>, - SDTCisPtrTy<4>]>, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; - -def X86masked_scatter : SDNode<"X86ISD::MSCATTER", - SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<0, 2>, - SDTCVecEltisVT<0, i1>, - SDTCisPtrTy<3>]>, - [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; - -def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); - return Mgt->getIndex().getValueType() == MVT::v4i32; -}]>; - -def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); - return Mgt->getIndex().getValueType() == MVT::v8i32; -}]>; - -def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); - return Mgt->getIndex().getValueType() == MVT::v2i64; -}]>; -def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); - return Mgt->getIndex().getValueType() == MVT::v4i64; -}]>; -def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); - return Mgt->getIndex().getValueType() 
== MVT::v8i64; -}]>; -def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); - return Mgt->getIndex().getValueType() == MVT::v16i32; -}]>; - -def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); - return Sc->getIndex().getValueType() == MVT::v2i64; -}]>; - -def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); - return Sc->getIndex().getValueType() == MVT::v4i32; -}]>; - -def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); - return Sc->getIndex().getValueType() == MVT::v4i64; -}]>; - -def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); - return Sc->getIndex().getValueType() == MVT::v8i32; -}]>; - -def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); - return Sc->getIndex().getValueType() == MVT::v8i64; -}]>; -def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ - X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); - return Sc->getIndex().getValueType() == MVT::v16i32; -}]>; - // 128-bit bitconvert pattern fragments def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; @@ -1037,6 +963,23 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src), return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8; }]>; +// Scalar SSE intrinsic fragments to match several different types of loads. +// Used by scalar SSE intrinsic instructions which have 128 bit types, but +// only load a single element. +// FIXME: We should add more canolicalizing in DAGCombine. Particulary removing +// the simple_load case. 
+def sse_load_f32 : PatFrags<(ops node:$ptr), + [(v4f32 (simple_load node:$ptr)), + (v4f32 (X86vzload32 node:$ptr)), + (v4f32 (scalar_to_vector (loadf32 node:$ptr)))]>; +def sse_load_f64 : PatFrags<(ops node:$ptr), + [(v2f64 (simple_load node:$ptr)), + (v2f64 (X86vzload64 node:$ptr)), + (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>; + +def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; +def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); @@ -1185,60 +1128,60 @@ def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore, def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr), (X86TruncSStore node:$val, node:$ptr), [{ - return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr), (X86TruncUSStore node:$val, node:$ptr), [{ - return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr), (X86TruncSStore node:$val, node:$ptr), [{ - return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr), (X86TruncUSStore node:$val, node:$ptr), [{ - return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr), (X86TruncSStore node:$val, node:$ptr), [{ - return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr), (X86TruncUSStore node:$val, node:$ptr), [{ - return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; }]>; def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, 
node:$src2, node:$src3), (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ - return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; + return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp index 90484241c28c..42c111173570 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -88,7 +88,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, + Register &SrcReg, Register &DstReg, unsigned &SubIdx) const { switch (MI.getOpcode()) { default: break; @@ -135,13 +135,497 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, return false; } +bool X86InstrInfo::isDataInvariant(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + // By default, assume that the instruction is not data invariant. + return false; + + // Some target-independent operations that trivially lower to data-invariant + // instructions. + case TargetOpcode::COPY: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + return true; + + // On x86 it is believed that imul is constant time w.r.t. the loaded data. + // However, they set flags and are perhaps the most surprisingly constant + // time operations so we call them out here separately. + case X86::IMUL16rr: + case X86::IMUL16rri8: + case X86::IMUL16rri: + case X86::IMUL32rr: + case X86::IMUL32rri8: + case X86::IMUL32rri: + case X86::IMUL64rr: + case X86::IMUL64rri32: + case X86::IMUL64rri8: + + // Bit scanning and counting instructions that are somewhat surprisingly + // constant time as they scan across bits and do other fairly complex + // operations like popcnt, but are believed to be constant time on x86. + // However, these set flags. + case X86::BSF16rr: + case X86::BSF32rr: + case X86::BSF64rr: + case X86::BSR16rr: + case X86::BSR32rr: + case X86::BSR64rr: + case X86::LZCNT16rr: + case X86::LZCNT32rr: + case X86::LZCNT64rr: + case X86::POPCNT16rr: + case X86::POPCNT32rr: + case X86::POPCNT64rr: + case X86::TZCNT16rr: + case X86::TZCNT32rr: + case X86::TZCNT64rr: + + // Bit manipulation instructions are effectively combinations of basic + // arithmetic ops, and should still execute in constant time. These also + // set flags. + case X86::BLCFILL32rr: + case X86::BLCFILL64rr: + case X86::BLCI32rr: + case X86::BLCI64rr: + case X86::BLCIC32rr: + case X86::BLCIC64rr: + case X86::BLCMSK32rr: + case X86::BLCMSK64rr: + case X86::BLCS32rr: + case X86::BLCS64rr: + case X86::BLSFILL32rr: + case X86::BLSFILL64rr: + case X86::BLSI32rr: + case X86::BLSI64rr: + case X86::BLSIC32rr: + case X86::BLSIC64rr: + case X86::BLSMSK32rr: + case X86::BLSMSK64rr: + case X86::BLSR32rr: + case X86::BLSR64rr: + case X86::TZMSK32rr: + case X86::TZMSK64rr: + + // Bit extracting and clearing instructions should execute in constant time, + // and set flags. 
+ case X86::BEXTR32rr: + case X86::BEXTR64rr: + case X86::BEXTRI32ri: + case X86::BEXTRI64ri: + case X86::BZHI32rr: + case X86::BZHI64rr: + + // Shift and rotate. + case X86::ROL8r1: + case X86::ROL16r1: + case X86::ROL32r1: + case X86::ROL64r1: + case X86::ROL8rCL: + case X86::ROL16rCL: + case X86::ROL32rCL: + case X86::ROL64rCL: + case X86::ROL8ri: + case X86::ROL16ri: + case X86::ROL32ri: + case X86::ROL64ri: + case X86::ROR8r1: + case X86::ROR16r1: + case X86::ROR32r1: + case X86::ROR64r1: + case X86::ROR8rCL: + case X86::ROR16rCL: + case X86::ROR32rCL: + case X86::ROR64rCL: + case X86::ROR8ri: + case X86::ROR16ri: + case X86::ROR32ri: + case X86::ROR64ri: + case X86::SAR8r1: + case X86::SAR16r1: + case X86::SAR32r1: + case X86::SAR64r1: + case X86::SAR8rCL: + case X86::SAR16rCL: + case X86::SAR32rCL: + case X86::SAR64rCL: + case X86::SAR8ri: + case X86::SAR16ri: + case X86::SAR32ri: + case X86::SAR64ri: + case X86::SHL8r1: + case X86::SHL16r1: + case X86::SHL32r1: + case X86::SHL64r1: + case X86::SHL8rCL: + case X86::SHL16rCL: + case X86::SHL32rCL: + case X86::SHL64rCL: + case X86::SHL8ri: + case X86::SHL16ri: + case X86::SHL32ri: + case X86::SHL64ri: + case X86::SHR8r1: + case X86::SHR16r1: + case X86::SHR32r1: + case X86::SHR64r1: + case X86::SHR8rCL: + case X86::SHR16rCL: + case X86::SHR32rCL: + case X86::SHR64rCL: + case X86::SHR8ri: + case X86::SHR16ri: + case X86::SHR32ri: + case X86::SHR64ri: + case X86::SHLD16rrCL: + case X86::SHLD32rrCL: + case X86::SHLD64rrCL: + case X86::SHLD16rri8: + case X86::SHLD32rri8: + case X86::SHLD64rri8: + case X86::SHRD16rrCL: + case X86::SHRD32rrCL: + case X86::SHRD64rrCL: + case X86::SHRD16rri8: + case X86::SHRD32rri8: + case X86::SHRD64rri8: + + // Basic arithmetic is constant time on the input but does set flags. + case X86::ADC8rr: + case X86::ADC8ri: + case X86::ADC16rr: + case X86::ADC16ri: + case X86::ADC16ri8: + case X86::ADC32rr: + case X86::ADC32ri: + case X86::ADC32ri8: + case X86::ADC64rr: + case X86::ADC64ri8: + case X86::ADC64ri32: + case X86::ADD8rr: + case X86::ADD8ri: + case X86::ADD16rr: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD32rr: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD64rr: + case X86::ADD64ri8: + case X86::ADD64ri32: + case X86::AND8rr: + case X86::AND8ri: + case X86::AND16rr: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND32rr: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND64rr: + case X86::AND64ri8: + case X86::AND64ri32: + case X86::OR8rr: + case X86::OR8ri: + case X86::OR16rr: + case X86::OR16ri: + case X86::OR16ri8: + case X86::OR32rr: + case X86::OR32ri: + case X86::OR32ri8: + case X86::OR64rr: + case X86::OR64ri8: + case X86::OR64ri32: + case X86::SBB8rr: + case X86::SBB8ri: + case X86::SBB16rr: + case X86::SBB16ri: + case X86::SBB16ri8: + case X86::SBB32rr: + case X86::SBB32ri: + case X86::SBB32ri8: + case X86::SBB64rr: + case X86::SBB64ri8: + case X86::SBB64ri32: + case X86::SUB8rr: + case X86::SUB8ri: + case X86::SUB16rr: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB32rr: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB64rr: + case X86::SUB64ri8: + case X86::SUB64ri32: + case X86::XOR8rr: + case X86::XOR8ri: + case X86::XOR16rr: + case X86::XOR16ri: + case X86::XOR16ri8: + case X86::XOR32rr: + case X86::XOR32ri: + case X86::XOR32ri8: + case X86::XOR64rr: + case X86::XOR64ri8: + case X86::XOR64ri32: + // Arithmetic with just 32-bit and 64-bit variants and no immediates. 
+ case X86::ADCX32rr: + case X86::ADCX64rr: + case X86::ADOX32rr: + case X86::ADOX64rr: + case X86::ANDN32rr: + case X86::ANDN64rr: + // Unary arithmetic operations. + case X86::DEC8r: + case X86::DEC16r: + case X86::DEC32r: + case X86::DEC64r: + case X86::INC8r: + case X86::INC16r: + case X86::INC32r: + case X86::INC64r: + case X86::NEG8r: + case X86::NEG16r: + case X86::NEG32r: + case X86::NEG64r: + + // Unlike other arithmetic, NOT doesn't set EFLAGS. + case X86::NOT8r: + case X86::NOT16r: + case X86::NOT32r: + case X86::NOT64r: + + // Various move instructions used to zero or sign extend things. Note that we + // intentionally don't support the _NOREX variants as we can't handle that + // register constraint anyways. + case X86::MOVSX16rr8: + case X86::MOVSX32rr8: + case X86::MOVSX32rr16: + case X86::MOVSX64rr8: + case X86::MOVSX64rr16: + case X86::MOVSX64rr32: + case X86::MOVZX16rr8: + case X86::MOVZX32rr8: + case X86::MOVZX32rr16: + case X86::MOVZX64rr8: + case X86::MOVZX64rr16: + case X86::MOV32rr: + + // Arithmetic instructions that are both constant time and don't set flags. + case X86::RORX32ri: + case X86::RORX64ri: + case X86::SARX32rr: + case X86::SARX64rr: + case X86::SHLX32rr: + case X86::SHLX64rr: + case X86::SHRX32rr: + case X86::SHRX64rr: + + // LEA doesn't actually access memory, and its arithmetic is constant time. + case X86::LEA16r: + case X86::LEA32r: + case X86::LEA64_32r: + case X86::LEA64r: + return true; + } +} + +bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + // By default, assume that the load will immediately leak. + return false; + + // On x86 it is believed that imul is constant time w.r.t. the loaded data. + // However, they set flags and are perhaps the most surprisingly constant + // time operations so we call them out here separately. + case X86::IMUL16rm: + case X86::IMUL16rmi8: + case X86::IMUL16rmi: + case X86::IMUL32rm: + case X86::IMUL32rmi8: + case X86::IMUL32rmi: + case X86::IMUL64rm: + case X86::IMUL64rmi32: + case X86::IMUL64rmi8: + + // Bit scanning and counting instructions that are somewhat surprisingly + // constant time as they scan across bits and do other fairly complex + // operations like popcnt, but are believed to be constant time on x86. + // However, these set flags. + case X86::BSF16rm: + case X86::BSF32rm: + case X86::BSF64rm: + case X86::BSR16rm: + case X86::BSR32rm: + case X86::BSR64rm: + case X86::LZCNT16rm: + case X86::LZCNT32rm: + case X86::LZCNT64rm: + case X86::POPCNT16rm: + case X86::POPCNT32rm: + case X86::POPCNT64rm: + case X86::TZCNT16rm: + case X86::TZCNT32rm: + case X86::TZCNT64rm: + + // Bit manipulation instructions are effectively combinations of basic + // arithmetic ops, and should still execute in constant time. These also + // set flags. + case X86::BLCFILL32rm: + case X86::BLCFILL64rm: + case X86::BLCI32rm: + case X86::BLCI64rm: + case X86::BLCIC32rm: + case X86::BLCIC64rm: + case X86::BLCMSK32rm: + case X86::BLCMSK64rm: + case X86::BLCS32rm: + case X86::BLCS64rm: + case X86::BLSFILL32rm: + case X86::BLSFILL64rm: + case X86::BLSI32rm: + case X86::BLSI64rm: + case X86::BLSIC32rm: + case X86::BLSIC64rm: + case X86::BLSMSK32rm: + case X86::BLSMSK64rm: + case X86::BLSR32rm: + case X86::BLSR64rm: + case X86::TZMSK32rm: + case X86::TZMSK64rm: + + // Bit extracting and clearing instructions should execute in constant time, + // and set flags. 
+ case X86::BEXTR32rm: + case X86::BEXTR64rm: + case X86::BEXTRI32mi: + case X86::BEXTRI64mi: + case X86::BZHI32rm: + case X86::BZHI64rm: + + // Basic arithmetic is constant time on the input but does set flags. + case X86::ADC8rm: + case X86::ADC16rm: + case X86::ADC32rm: + case X86::ADC64rm: + case X86::ADCX32rm: + case X86::ADCX64rm: + case X86::ADD8rm: + case X86::ADD16rm: + case X86::ADD32rm: + case X86::ADD64rm: + case X86::ADOX32rm: + case X86::ADOX64rm: + case X86::AND8rm: + case X86::AND16rm: + case X86::AND32rm: + case X86::AND64rm: + case X86::ANDN32rm: + case X86::ANDN64rm: + case X86::OR8rm: + case X86::OR16rm: + case X86::OR32rm: + case X86::OR64rm: + case X86::SBB8rm: + case X86::SBB16rm: + case X86::SBB32rm: + case X86::SBB64rm: + case X86::SUB8rm: + case X86::SUB16rm: + case X86::SUB32rm: + case X86::SUB64rm: + case X86::XOR8rm: + case X86::XOR16rm: + case X86::XOR32rm: + case X86::XOR64rm: + + // Integer multiply w/o affecting flags is still believed to be constant + // time on x86. Called out separately as this is among the most surprising + // instructions to exhibit that behavior. + case X86::MULX32rm: + case X86::MULX64rm: + + // Arithmetic instructions that are both constant time and don't set flags. + case X86::RORX32mi: + case X86::RORX64mi: + case X86::SARX32rm: + case X86::SARX64rm: + case X86::SHLX32rm: + case X86::SHLX64rm: + case X86::SHRX32rm: + case X86::SHRX64rm: + + // Conversions are believed to be constant time and don't set flags. + case X86::CVTTSD2SI64rm: + case X86::VCVTTSD2SI64rm: + case X86::VCVTTSD2SI64Zrm: + case X86::CVTTSD2SIrm: + case X86::VCVTTSD2SIrm: + case X86::VCVTTSD2SIZrm: + case X86::CVTTSS2SI64rm: + case X86::VCVTTSS2SI64rm: + case X86::VCVTTSS2SI64Zrm: + case X86::CVTTSS2SIrm: + case X86::VCVTTSS2SIrm: + case X86::VCVTTSS2SIZrm: + case X86::CVTSI2SDrm: + case X86::VCVTSI2SDrm: + case X86::VCVTSI2SDZrm: + case X86::CVTSI2SSrm: + case X86::VCVTSI2SSrm: + case X86::VCVTSI2SSZrm: + case X86::CVTSI642SDrm: + case X86::VCVTSI642SDrm: + case X86::VCVTSI642SDZrm: + case X86::CVTSI642SSrm: + case X86::VCVTSI642SSrm: + case X86::VCVTSI642SSZrm: + case X86::CVTSS2SDrm: + case X86::VCVTSS2SDrm: + case X86::VCVTSS2SDZrm: + case X86::CVTSD2SSrm: + case X86::VCVTSD2SSrm: + case X86::VCVTSD2SSZrm: + // AVX512 added unsigned integer conversions. + case X86::VCVTTSD2USI64Zrm: + case X86::VCVTTSD2USIZrm: + case X86::VCVTTSS2USI64Zrm: + case X86::VCVTTSS2USIZrm: + case X86::VCVTUSI2SDZrm: + case X86::VCVTUSI642SDZrm: + case X86::VCVTUSI2SSZrm: + case X86::VCVTUSI642SSZrm: + + // Loads to register don't set flags. 
+ case X86::MOV8rm: + case X86::MOV8rm_NOREX: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::MOVSX16rm8: + case X86::MOVSX32rm16: + case X86::MOVSX32rm8: + case X86::MOVSX32rm8_NOREX: + case X86::MOVSX64rm16: + case X86::MOVSX64rm32: + case X86::MOVSX64rm8: + case X86::MOVZX16rm8: + case X86::MOVZX32rm16: + case X86::MOVZX32rm8: + case X86::MOVZX32rm8_NOREX: + case X86::MOVZX64rm16: + case X86::MOVZX64rm8: + return true; + } +} + int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { const MachineFunction *MF = MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); if (isFrameInstr(MI)) { - unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = alignTo(getFrameSize(MI), StackAlign); + int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign()); SPAdj -= getFrameAdjustment(MI); if (!isFrameSetup(MI)) SPAdj = -SPAdj; @@ -639,7 +1123,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, unsigned SubIdx, + Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI); @@ -1182,61 +1666,61 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk: case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk: case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk: - case X86::VBROADCASTSDZ256mk: - case X86::VBROADCASTSDZmk: - case X86::VBROADCASTSSZ128mk: - case X86::VBROADCASTSSZ256mk: - case X86::VBROADCASTSSZmk: - case X86::VPBROADCASTDZ128mk: - case X86::VPBROADCASTDZ256mk: - case X86::VPBROADCASTDZmk: - case X86::VPBROADCASTQZ128mk: - case X86::VPBROADCASTQZ256mk: - case X86::VPBROADCASTQZmk: { + case X86::VBROADCASTSDZ256rmk: + case X86::VBROADCASTSDZrmk: + case X86::VBROADCASTSSZ128rmk: + case X86::VBROADCASTSSZ256rmk: + case X86::VBROADCASTSSZrmk: + case X86::VPBROADCASTDZ128rmk: + case X86::VPBROADCASTDZ256rmk: + case X86::VPBROADCASTDZrmk: + case X86::VPBROADCASTQZ128rmk: + case X86::VPBROADCASTQZ256rmk: + case X86::VPBROADCASTQZrmk: { unsigned Opc; switch (MIOpc) { default: llvm_unreachable("Unreachable!"); - case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; - case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; - case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; - case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; - case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; - case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; - case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; - case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; - case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; - case X86::VMOVDQA32Z256rmk: Opc = 
X86::VPBLENDMDZ256rmk; break; - case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; - case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; - case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; - case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; - case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; - case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; - case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; - case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; - case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; - case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; - case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break; - case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break; - case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break; - case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break; - case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break; - case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break; - case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break; - case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break; - case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break; - case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break; - case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break; + case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break; + case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break; + case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break; + case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break; + case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break; + case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break; + case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break; + case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break; + case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break; + case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break; + case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break; + case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break; + case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break; + case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break; + case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break; + case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break; + case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break; + case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break; + case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break; + case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break; + case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break; + case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; 
break; + case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break; + case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break; + case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break; + case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break; + case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break; + case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break; + case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break; } NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -1883,7 +2367,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, unsigned KMaskOp = -1U; if (X86II::isKMasked(TSFlags)) { // For k-zero-masked operations it is Ok to commute the first vector - // operand. + // operand. Unless this is an intrinsic instruction. // For regular k-masked operations a conservative choice is done as the // elements of the first vector operand, for which the corresponding bit // in the k-mask operand is set to 0, are copied to the result of the @@ -1902,7 +2386,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, // The operand with index = 1 is used as a source for those elements for // which the corresponding bit in the k-mask is set to 0. - if (X86II::isKMergeMasked(TSFlags)) + if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic) FirstCommutableVecOp = 3; LastCommutableVecOp++; @@ -2379,17 +2863,6 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) { return Imm; } -bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const { - if (!MI.isTerminator()) return false; - - // Conditional branch is a special case. - if (MI.isBranch() && !MI.isBarrier()) - return true; - if (!MI.isPredicable()) - return true; - return !isPredicated(MI); -} - bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const { switch (MI.getOpcode()) { case X86::TCRETURNdi: @@ -2826,11 +3299,11 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB, return Count; } -bool X86InstrInfo:: -canInsertSelect(const MachineBasicBlock &MBB, - ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg, - int &CondCycles, int &TrueCycles, int &FalseCycles) const { +bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, + ArrayRef<MachineOperand> Cond, + Register DstReg, Register TrueReg, + Register FalseReg, int &CondCycles, + int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. 
if (!Subtarget.hasCMov()) return false; @@ -2865,9 +3338,9 @@ canInsertSelect(const MachineBasicBlock &MBB, void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DstReg, - ArrayRef<MachineOperand> Cond, unsigned TrueReg, - unsigned FalseReg) const { + const DebugLoc &DL, Register DstReg, + ArrayRef<MachineOperand> Cond, Register TrueReg, + Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); const TargetRegisterClass &RC = *MRI.getRegClass(DstReg); @@ -3189,8 +3662,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } } -bool X86InstrInfo::getMemOperandWithOffset( - const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset, +bool X86InstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, const TargetRegisterInfo *TRI) const { const MCInstrDesc &Desc = MemOp.getDesc(); int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); @@ -3199,7 +3673,8 @@ bool X86InstrInfo::getMemOperandWithOffset( MemRefBegin += X86II::getOperandBias(Desc); - BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); + const MachineOperand *BaseOp = + &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); if (!BaseOp->isReg()) // Can be an MO_FrameIndex return false; @@ -3221,6 +3696,13 @@ bool X86InstrInfo::getMemOperandWithOffset( if (!BaseOp->isReg()) return false; + OffsetIsScalable = false; + // FIXME: Relying on memoperands() may not be right thing to do here. Check + // with X86 maintainers, and fix it accordingly. For now, it is ok, since + // there is no use of `Width` for X86 back-end at the moment. + Width = + !MemOp.memoperands_empty() ? 
MemOp.memoperands().front()->getSize() : 0; + BaseOps.push_back(BaseOp); return true; } @@ -3241,7 +3723,7 @@ static unsigned getLoadRegOpcode(unsigned DestReg, void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIdx, + Register SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); @@ -3249,7 +3731,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, "Stack slot too small for store"); unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = - (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) @@ -3258,20 +3740,20 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIdx, + Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = - (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); } -bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, +bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const { switch (MI.getOpcode()) { default: break; @@ -3358,7 +3840,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, /// SrcReg, SrcRegs: register operands for FlagI. /// ImmValue: immediate for FlagI if it takes an immediate. inline static bool isRedundantFlagInstr(const MachineInstr &FlagI, - unsigned SrcReg, unsigned SrcReg2, + Register SrcReg, Register SrcReg2, int ImmMask, int ImmValue, const MachineInstr &OI) { if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) || @@ -3547,8 +4029,8 @@ static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { /// Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. -bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, +bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, + Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { // Check whether we can replace SUB with CMP. @@ -3875,15 +4357,15 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); - Register Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB.getReg(0); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any // implicit operands. 
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); // But we don't trust that. - assert(MIB->getOperand(1).getReg() == Reg && - MIB->getOperand(2).getReg() == Reg && "Misplaced operand"); + assert(MIB.getReg(1) == Reg && + MIB.getReg(2) == Reg && "Misplaced operand"); return true; } @@ -3905,7 +4387,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, bool MinusOne) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - Register Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB.getReg(0); // Insert the XOR. BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) @@ -3949,7 +4431,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm); MIB->setDesc(TII.get(X86::POP64r)); MIB->getOperand(0) - .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); + .setReg(getX86SubSuperRegister(MIB.getReg(0), 64)); } else { assert(MIB->getOpcode() == X86::MOV32ImmSExti8); StackAdjustment = 4; @@ -3981,14 +4463,14 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); - Register Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB.getReg(0); const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8); + MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8)); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -4019,7 +4501,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB, const MCInstrDesc &LoadDesc, const MCInstrDesc &BroadcastDesc, unsigned SubIdx) { - Register DestReg = MIB->getOperand(0).getReg(); + Register DestReg = MIB.getReg(0); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(DestReg) < 16) { // We can use a normal VEX encoded load. @@ -4042,7 +4524,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB, const MCInstrDesc &StoreDesc, const MCInstrDesc &ExtractDesc, unsigned SubIdx) { - Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg(); + Register SrcReg = MIB.getReg(X86::AddrNumOperands); // Check if DestReg is XMM16-31 or YMM16-31. if (TRI->getEncodingValue(SrcReg) < 16) { // We can use a normal VEX encoded store. @@ -4065,7 +4547,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { // Temporarily remove the immediate so we can add another source register. MIB->RemoveOperand(2); // Add the register. Don't copy the kill flag if there is one. - MIB.addReg(MIB->getOperand(1).getReg(), + MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef())); // Add back the immediate. 
MIB.addImm(ShiftAmt); @@ -4085,10 +4567,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::MOV32ImmSExti8: case X86::MOV64ImmSExti8: return ExpandMOVImmSExti8(MIB, *this, Subtarget); - case X86::SETB_C8r: - return Expand2AddrUndef(MIB, get(X86::SBB8rr)); - case X86::SETB_C16r: - return Expand2AddrUndef(MIB, get(X86::SBB16rr)); case X86::SETB_C32r: return Expand2AddrUndef(MIB, get(X86::SBB32rr)); case X86::SETB_C64r: @@ -4103,7 +4581,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX_SET0: { assert(HasAVX && "AVX not supported"); const TargetRegisterInfo *TRI = &getRegisterInfo(); - Register SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB.getReg(0); Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); MIB->getOperand(0).setReg(XReg); Expand2AddrUndef(MIB, get(X86::VXORPSrr)); @@ -4115,7 +4593,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0F128: { bool HasVLX = Subtarget.hasVLX(); - Register SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB.getReg(0); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) return Expand2AddrUndef(MIB, @@ -4129,7 +4607,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: { bool HasVLX = Subtarget.hasVLX(); - Register SrcReg = MIB->getOperand(0).getReg(); + Register SrcReg = MIB.getReg(0); const TargetRegisterInfo *TRI = &getRegisterInfo(); if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); @@ -4152,14 +4630,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::AVX1_SETALLONES: { - Register Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB.getReg(0); // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS. MIB->setDesc(get(X86::VCMPPSYrri)); MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf); return true; } case X86::AVX512_512_SETALLONES: { - Register Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB.getReg(0); MIB->setDesc(get(X86::VPTERNLOGDZrri)); // VPTERNLOGD needs 3 register inputs and an immediate. // 0xff will return 1s for any input. @@ -4169,8 +4647,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } case X86::AVX512_512_SEXT_MASK_32: case X86::AVX512_512_SEXT_MASK_64: { - Register Reg = MIB->getOperand(0).getReg(); - Register MaskReg = MIB->getOperand(1).getReg(); + Register Reg = MIB.getReg(0); + Register MaskReg = MIB.getReg(1); unsigned MaskState = getRegState(MIB->getOperand(1)); unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; @@ -4207,7 +4685,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr), get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm); case X86::MOV32ri64: { - Register Reg = MIB->getOperand(0).getReg(); + Register Reg = MIB.getReg(0); Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit); MI.setDesc(get(X86::MOV32ri)); MIB->getOperand(0).setReg(Reg32); @@ -4360,11 +4838,105 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance( // Return true for any instruction the copies the high bits of the first source // operand into the unused high bits of the destination operand. 
-static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, +// Also returns true for instructions that have two inputs where one may +// be undef and we want it to use the same register as the other input. +static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold = false) { // Set the OpNum parameter to the first source operand. - OpNum = 1; switch (Opcode) { + case X86::MMX_PUNPCKHBWirr: + case X86::MMX_PUNPCKHWDirr: + case X86::MMX_PUNPCKHDQirr: + case X86::MMX_PUNPCKLBWirr: + case X86::MMX_PUNPCKLWDirr: + case X86::MMX_PUNPCKLDQirr: + case X86::MOVHLPSrr: + case X86::PACKSSWBrr: + case X86::PACKUSWBrr: + case X86::PACKSSDWrr: + case X86::PACKUSDWrr: + case X86::PUNPCKHBWrr: + case X86::PUNPCKLBWrr: + case X86::PUNPCKHWDrr: + case X86::PUNPCKLWDrr: + case X86::PUNPCKHDQrr: + case X86::PUNPCKLDQrr: + case X86::PUNPCKHQDQrr: + case X86::PUNPCKLQDQrr: + case X86::SHUFPDrri: + case X86::SHUFPSrri: + // These instructions are sometimes used with an undef first or second + // source. Return true here so BreakFalseDeps will assign this source to the + // same register as the first source to avoid a false dependency. + // Operand 1 of these instructions is tied so they're separate from their + // VEX counterparts. + return OpNum == 2 && !ForLoadFold; + + case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: + case X86::VPACKSSWBrr: + case X86::VPACKUSWBrr: + case X86::VPACKSSDWrr: + case X86::VPACKUSDWrr: + case X86::VPACKSSWBZ128rr: + case X86::VPACKUSWBZ128rr: + case X86::VPACKSSDWZ128rr: + case X86::VPACKUSDWZ128rr: + case X86::VPERM2F128rr: + case X86::VPERM2I128rr: + case X86::VSHUFF32X4Z256rri: + case X86::VSHUFF32X4Zrri: + case X86::VSHUFF64X2Z256rri: + case X86::VSHUFF64X2Zrri: + case X86::VSHUFI32X4Z256rri: + case X86::VSHUFI32X4Zrri: + case X86::VSHUFI64X2Z256rri: + case X86::VSHUFI64X2Zrri: + case X86::VPUNPCKHBWrr: + case X86::VPUNPCKLBWrr: + case X86::VPUNPCKHBWYrr: + case X86::VPUNPCKLBWYrr: + case X86::VPUNPCKHBWZ128rr: + case X86::VPUNPCKLBWZ128rr: + case X86::VPUNPCKHBWZ256rr: + case X86::VPUNPCKLBWZ256rr: + case X86::VPUNPCKHBWZrr: + case X86::VPUNPCKLBWZrr: + case X86::VPUNPCKHWDrr: + case X86::VPUNPCKLWDrr: + case X86::VPUNPCKHWDYrr: + case X86::VPUNPCKLWDYrr: + case X86::VPUNPCKHWDZ128rr: + case X86::VPUNPCKLWDZ128rr: + case X86::VPUNPCKHWDZ256rr: + case X86::VPUNPCKLWDZ256rr: + case X86::VPUNPCKHWDZrr: + case X86::VPUNPCKLWDZrr: + case X86::VPUNPCKHDQrr: + case X86::VPUNPCKLDQrr: + case X86::VPUNPCKHDQYrr: + case X86::VPUNPCKLDQYrr: + case X86::VPUNPCKHDQZ128rr: + case X86::VPUNPCKLDQZ128rr: + case X86::VPUNPCKHDQZ256rr: + case X86::VPUNPCKLDQZ256rr: + case X86::VPUNPCKHDQZrr: + case X86::VPUNPCKLDQZrr: + case X86::VPUNPCKHQDQrr: + case X86::VPUNPCKLQDQrr: + case X86::VPUNPCKHQDQYrr: + case X86::VPUNPCKLQDQYrr: + case X86::VPUNPCKHQDQZ128rr: + case X86::VPUNPCKLQDQZ128rr: + case X86::VPUNPCKHQDQZ256rr: + case X86::VPUNPCKLQDQZ256rr: + case X86::VPUNPCKHQDQZrr: + case X86::VPUNPCKLQDQZrr: + // These instructions are sometimes used with an undef first or second + // source. Return true here so BreakFalseDeps will assign this source to the + // same register as the first source to avoid a false dependency. + return (OpNum == 1 || OpNum == 2) && !ForLoadFold; + case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSrr_Int: @@ -4422,7 +4994,7 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, case X86::VCVTUSI642SDZrm_Int: // Load folding won't effect the undef register update since the input is // a GPR. 
- return !ForLoadFold; + return OpNum == 1 && !ForLoadFold; case X86::VCVTSD2SSrr: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSrr_Int: @@ -4521,15 +5093,13 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, case X86::VSQRTSDZrb_Int: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: - return true; + return OpNum == 1; case X86::VMOVSSZrrk: case X86::VMOVSDZrrk: - OpNum = 3; - return true; + return OpNum == 3 && !ForLoadFold; case X86::VMOVSSZrrkz: case X86::VMOVSDZrrkz: - OpNum = 2; - return true; + return OpNum == 2 && !ForLoadFold; } return false; @@ -4552,13 +5122,17 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum, unsigned X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI.getOpcode(), OpNum)) - return 0; - - const MachineOperand &MO = MI.getOperand(OpNum); - if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) { - return UndefRegClearance; + for (unsigned i = MI.getNumExplicitDefs(), e = MI.getNumExplicitOperands(); + i != e; ++i) { + const MachineOperand &MO = MI.getOperand(i); + if (MO.isReg() && MO.isUndef() && + Register::isPhysicalRegister(MO.getReg()) && + hasUndefRegUpdate(MI.getOpcode(), i)) { + OpNum = i; + return UndefRegClearance; + } } + return 0; } @@ -4729,7 +5303,7 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, MachineInstr *X86InstrInfo::foldMemoryOperandCustom( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const { + unsigned Size, Align Alignment) const { switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: @@ -4745,7 +5319,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = @@ -4769,7 +5343,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : (MI.getOpcode() == X86::VMOVHLPSrr) ? 
X86::VMOVLPSrm : @@ -4788,7 +5362,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this); return NewMI; @@ -4802,8 +5376,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, MachineInstr &MI) { - unsigned Ignored; - if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) || + if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) || !MI.getOperand(1).isReg()) return false; @@ -4820,11 +5393,10 @@ static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, return VRegDef && VRegDef->isImplicitDef(); } - MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align, bool AllowCommute) const { + unsigned Size, Align Alignment, bool AllowCommute) const { bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); bool isTwoAddrFold = false; @@ -4864,8 +5436,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineInstr *NewMI = nullptr; // Attempt to fold any custom cases we have. - if (MachineInstr *CustomMI = - foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + if (MachineInstr *CustomMI = foldMemoryOperandCustom( + MF, MI, OpNum, MOs, InsertPt, Size, Alignment)) return CustomMI; const X86MemoryFoldTableEntry *I = nullptr; @@ -4892,9 +5464,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (I != nullptr) { unsigned Opcode = I->DstOp; - unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; - MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0; - if (Align < MinAlign) + MaybeAlign MinAlign = + decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT); + if (MinAlign && Alignment < *MinAlign) return nullptr; bool NarrowToMOV32rm = false; if (Size) { @@ -4969,8 +5541,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( } // Attempt to fold with the commuted version of the instruction. - NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, - Size, Align, /*AllowCommute=*/false); + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size, + Alignment, /*AllowCommute=*/false); if (NewMI) return NewMI; @@ -5024,12 +5596,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Size = MFI.getObjectSize(FrameIndex); - unsigned Alignment = MFI.getObjectAlignment(FrameIndex); + Align Alignment = MFI.getObjectAlign(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. 
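The fold-table lookup in the hunk above now decodes an entry's minimum-alignment requirement with decodeMaybeAlign instead of open-coding the shift. The encoding itself is unchanged: the TB_ALIGN field stores 0 for "no requirement" and otherwise log2(alignment) + 1. A minimal standalone sketch of that scheme, in plain C++ with made-up names rather than the real LLVM types:

#include <cassert>
#include <cstdint>
#include <optional>

// Stand-in for MaybeAlign/decodeMaybeAlign: nullopt means "no alignment
// requirement", otherwise the value is a power of two.
using MaybeAlignment = std::optional<uint64_t>;

// Encode an alignment the way the fold-table flags store it: 0, or log2 + 1.
uint8_t encodeAlign(MaybeAlignment A) {
  if (!A)
    return 0;                       // 0 => no minimum alignment
  uint8_t Log2 = 0;
  for (uint64_t V = *A; V > 1; V >>= 1)
    ++Log2;
  return Log2 + 1;                  // 1 => 1 byte, 5 => 16 bytes, ...
}

MaybeAlignment decodeAlign(uint8_t Encoded) {
  if (Encoded == 0)
    return std::nullopt;
  return uint64_t(1) << (Encoded - 1);
}

int main() {
  assert(decodeAlign(encodeAlign(uint64_t(16))) == uint64_t(16));
  assert(!decodeAlign(0));
  // The fold is rejected when the operand's known alignment is below the
  // decoded minimum, mirroring "if (MinAlign && Alignment < *MinAlign)".
  uint64_t OperandAlign = 8;
  MaybeAlignment Min = decodeAlign(5); // 16 bytes
  assert(Min && OperandAlign < *Min);  // -> fold refused
}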
if (!RI.needsStackRealignment(MF)) Alignment = - std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -5087,12 +5659,31 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // destination register is wider than 32 bits (4 bytes), and its user // instruction isn't scalar (SS). switch (UserOpc) { + case X86::CVTSS2SDrr_Int: + case X86::VCVTSS2SDrr_Int: + case X86::VCVTSS2SDZrr_Int: + case X86::VCVTSS2SDZrr_Intk: + case X86::VCVTSS2SDZrr_Intkz: + case X86::CVTSS2SIrr_Int: case X86::CVTSS2SI64rr_Int: + case X86::VCVTSS2SIrr_Int: case X86::VCVTSS2SI64rr_Int: + case X86::VCVTSS2SIZrr_Int: case X86::VCVTSS2SI64Zrr_Int: + case X86::CVTTSS2SIrr_Int: case X86::CVTTSS2SI64rr_Int: + case X86::VCVTTSS2SIrr_Int: case X86::VCVTTSS2SI64rr_Int: + case X86::VCVTTSS2SIZrr_Int: case X86::VCVTTSS2SI64Zrr_Int: + case X86::VCVTSS2USIZrr_Int: case X86::VCVTSS2USI64Zrr_Int: + case X86::VCVTTSS2USIZrr_Int: case X86::VCVTTSS2USI64Zrr_Int: + case X86::RCPSSr_Int: case X86::VRCPSSr_Int: + case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int: + case X86::ROUNDSSr_Int: case X86::VROUNDSSr_Int: + case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int: + case X86::UCOMISSrr_Int:case X86::VUCOMISSrr_Int:case X86::VUCOMISSZrr_Int: case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int: case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int: case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int: case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int: case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: + case X86::SQRTSSr_Int: case X86::VSQRTSSr_Int: case X86::VSQRTSSZr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz: case X86::VCMPSSZrr_Intk: @@ -5100,6 +5691,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz: case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz: case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz: + case X86::VSQRTSSZr_Intk: case X86::VSQRTSSZr_Intkz: case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz: case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int: case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int: @@ -5127,6 +5719,41 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz: case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz: case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz: + case X86::VFIXUPIMMSSZrri: + case X86::VFIXUPIMMSSZrrik: + case X86::VFIXUPIMMSSZrrikz: + case X86::VFPCLASSSSZrr: + case X86::VFPCLASSSSZrrk: + case X86::VGETEXPSSZr: + case X86::VGETEXPSSZrk: + case X86::VGETEXPSSZrkz: + case X86::VGETMANTSSZrri: + case X86::VGETMANTSSZrrik: + case X86::VGETMANTSSZrrikz: + case X86::VRANGESSZrri: + case X86::VRANGESSZrrik: + case X86::VRANGESSZrrikz: + case X86::VRCP14SSZrr: + case X86::VRCP14SSZrrk: + case X86::VRCP14SSZrrkz: + case X86::VRCP28SSZr: + case X86::VRCP28SSZrk: + case X86::VRCP28SSZrkz: + case X86::VREDUCESSZrri: + case X86::VREDUCESSZrrik: + case X86::VREDUCESSZrrikz: + case X86::VRNDSCALESSZr_Int: + case X86::VRNDSCALESSZr_Intk: + case 
X86::VRNDSCALESSZr_Intkz: + case X86::VRSQRT14SSZrr: + case X86::VRSQRT14SSZrrk: + case X86::VRSQRT14SSZrrkz: + case X86::VRSQRT28SSZr: + case X86::VRSQRT28SSZrk: + case X86::VRSQRT28SSZrkz: + case X86::VSCALEFSSZrr: + case X86::VSCALEFSSZrrk: + case X86::VSCALEFSSZrrkz: return false; default: return true; @@ -5141,12 +5768,29 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // destination register is wider than 64 bits (8 bytes), and its user // instruction isn't scalar (SD). switch (UserOpc) { + case X86::CVTSD2SSrr_Int: + case X86::VCVTSD2SSrr_Int: + case X86::VCVTSD2SSZrr_Int: + case X86::VCVTSD2SSZrr_Intk: + case X86::VCVTSD2SSZrr_Intkz: + case X86::CVTSD2SIrr_Int: case X86::CVTSD2SI64rr_Int: + case X86::VCVTSD2SIrr_Int: case X86::VCVTSD2SI64rr_Int: + case X86::VCVTSD2SIZrr_Int: case X86::VCVTSD2SI64Zrr_Int: + case X86::CVTTSD2SIrr_Int: case X86::CVTTSD2SI64rr_Int: + case X86::VCVTTSD2SIrr_Int: case X86::VCVTTSD2SI64rr_Int: + case X86::VCVTTSD2SIZrr_Int: case X86::VCVTTSD2SI64Zrr_Int: + case X86::VCVTSD2USIZrr_Int: case X86::VCVTSD2USI64Zrr_Int: + case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int: + case X86::ROUNDSDr_Int: case X86::VROUNDSDr_Int: + case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int: + case X86::UCOMISDrr_Int:case X86::VUCOMISDrr_Int:case X86::VUCOMISDZrr_Int: case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int: case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int: case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int: case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int: case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: + case X86::SQRTSDr_Int: case X86::VSQRTSDr_Int: case X86::VSQRTSDZr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz: case X86::VCMPSDZrr_Intk: @@ -5154,6 +5798,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz: case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz: case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz: + case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz: case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz: case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int: case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int: @@ -5181,6 +5826,41 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz: case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz: case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz: + case X86::VFIXUPIMMSDZrri: + case X86::VFIXUPIMMSDZrrik: + case X86::VFIXUPIMMSDZrrikz: + case X86::VFPCLASSSDZrr: + case X86::VFPCLASSSDZrrk: + case X86::VGETEXPSDZr: + case X86::VGETEXPSDZrk: + case X86::VGETEXPSDZrkz: + case X86::VGETMANTSDZrri: + case X86::VGETMANTSDZrrik: + case X86::VGETMANTSDZrrikz: + case X86::VRANGESDZrri: + case X86::VRANGESDZrrik: + case X86::VRANGESDZrrikz: + case X86::VRCP14SDZrr: + case X86::VRCP14SDZrrk: + case X86::VRCP14SDZrrkz: + case X86::VRCP28SDZr: + case X86::VRCP28SDZrk: + case X86::VRCP28SDZrkz: + case X86::VREDUCESDZrri: + case X86::VREDUCESDZrrik: + case X86::VREDUCESDZrrikz: + case X86::VRNDSCALESDZr_Int: + case X86::VRNDSCALESDZr_Intk: + case X86::VRNDSCALESDZr_Intkz: + case X86::VRSQRT14SDZrr: + case 
X86::VRSQRT14SDZrrk: + case X86::VRSQRT14SDZrrkz: + case X86::VRSQRT28SDZr: + case X86::VRSQRT28SDZrk: + case X86::VRSQRT28SDZrkz: + case X86::VSCALEFSDZrr: + case X86::VSCALEFSDZrrk: + case X86::VSCALEFSDZrrkz: return false; default: return true; @@ -5221,36 +5901,36 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; // Determine the alignment of the load. - unsigned Alignment = 0; + Align Alignment; if (LoadMI.hasOneMemOperand()) - Alignment = (*LoadMI.memoperands_begin())->getAlignment(); + Alignment = (*LoadMI.memoperands_begin())->getAlign(); else switch (LoadMI.getOpcode()) { case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: - Alignment = 64; + Alignment = Align(64); break; case X86::AVX2_SETALLONES: case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: - Alignment = 32; + Alignment = Align(32); break; case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: - Alignment = 16; + Alignment = Align(16); break; case X86::MMX_SET0: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: - Alignment = 8; + Alignment = Align(8); break; case X86::FsFLD0SS: case X86::AVX512_FsFLD0SS: - Alignment = 4; + Alignment = Align(4); break; default: return nullptr; @@ -5325,14 +6005,18 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16); + Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), + 16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8); + Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), + 8); else if (Opc == X86::MMX_SET0) - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2); + Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), + 2); else - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4); + Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), + 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || Opc == X86::AVX512_512_SETALLONES || @@ -5418,33 +6102,33 @@ static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I, case TB_BCAST_D: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); - case 16: return X86::VPBROADCASTDZ128m; - case 32: return X86::VPBROADCASTDZ256m; - case 64: return X86::VPBROADCASTDZm; + case 16: return X86::VPBROADCASTDZ128rm; + case 32: return X86::VPBROADCASTDZ256rm; + case 64: return X86::VPBROADCASTDZrm; } break; case TB_BCAST_Q: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); - case 16: return X86::VPBROADCASTQZ128m; - case 32: return X86::VPBROADCASTQZ256m; - case 64: return X86::VPBROADCASTQZm; + case 16: return X86::VPBROADCASTQZ128rm; + case 32: return X86::VPBROADCASTQZ256rm; + case 64: return X86::VPBROADCASTQZrm; } break; case TB_BCAST_SS: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); - case 16: return X86::VBROADCASTSSZ128m; - case 32: return X86::VBROADCASTSSZ256m; - case 64: return X86::VBROADCASTSSZm; + case 16: return X86::VBROADCASTSSZ128rm; + case 32: return X86::VBROADCASTSSZ256rm; + case 64: return 
X86::VBROADCASTSSZrm; } break; case TB_BCAST_SD: switch (SpillSize) { default: llvm_unreachable("Unknown spill size"); case 16: return X86::VMOVDDUPZ128rm; - case 32: return X86::VBROADCASTSDZ256m; - case 64: return X86::VBROADCASTSDZm; + case 32: return X86::VBROADCASTSDZ256rm; + case 64: return X86::VBROADCASTSDZrm; } break; } @@ -5504,7 +6188,7 @@ bool X86InstrInfo::unfoldMemoryOperand( Opc = getBroadcastOpcode(I, RC, Subtarget); } else { unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget); } @@ -5581,7 +6265,7 @@ bool X86InstrInfo::unfoldMemoryOperand( const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); auto MMOs = extractStoreMMOs(MI.memoperands(), MF); unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); @@ -5648,7 +6332,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, Opc = getBroadcastOpcode(I, RC, Subtarget); } else { unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget); } @@ -5714,7 +6398,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte // memory access is slow above. 
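The expanded isNonFoldablePartialRegisterLoad whitelist earlier in this hunk is driven by a single rule: a load that is narrower than the destination register class may only be folded into a user that reads just the low element, otherwise the upper lanes would be left undefined. A rough standalone model of that decision (the names and widths are illustrative, not LLVM API):

#include <cassert>

// RegBytes: size of the register class the load would be folded into
// (e.g. 16 for VR128). LoadBytes: size of the memory access (e.g. 4 for a
// float). UserReadsOnlyLowElement: true for the scalar "_Int" ops that the
// diff whitelists above.
bool canFoldPartialLoad(unsigned LoadBytes, unsigned RegBytes,
                        bool UserReadsOnlyLowElement) {
  if (LoadBytes >= RegBytes)
    return true;                    // not a partial load at all
  return UserReadsOnlyLowElement;   // upper lanes would otherwise be garbage
}

int main() {
  assert(canFoldPartialLoad(4, 16, /*scalar SS-style user*/ true));
  assert(!canFoldPartialLoad(4, 16, /*packed PS-style user*/ false));
  assert(canFoldPartialLoad(16, 16, false));
}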
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16); - bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment; + bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment; SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), dl, MVT::Other, AddrOps); @@ -6124,18 +6808,18 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm }, { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm }, { X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm }, - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r }, - { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m }, - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r }, - { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m }, - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr }, - { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm }, - { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r }, - { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r }, - { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr }, - { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm }, + { X86::VBROADCASTSSZ128rr,X86::VBROADCASTSSZ128rr,X86::VPBROADCASTDZ128rr }, + { X86::VBROADCASTSSZ128rm,X86::VBROADCASTSSZ128rm,X86::VPBROADCASTDZ128rm }, + { X86::VBROADCASTSSZ256rr,X86::VBROADCASTSSZ256rr,X86::VPBROADCASTDZ256rr }, + { X86::VBROADCASTSSZ256rm,X86::VBROADCASTSSZ256rm,X86::VPBROADCASTDZ256rm }, + { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrr, X86::VPBROADCASTDZrr }, + { X86::VBROADCASTSSZrm, X86::VBROADCASTSSZrm, X86::VPBROADCASTDZrm }, + { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128rr }, + { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128rm }, + { X86::VBROADCASTSDZ256rr,X86::VBROADCASTSDZ256rr,X86::VPBROADCASTQZ256rr }, + { X86::VBROADCASTSDZ256rm,X86::VBROADCASTSDZ256rm,X86::VPBROADCASTQZ256rm }, + { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrr, X86::VPBROADCASTQZrr }, + { X86::VBROADCASTSDZrm, X86::VBROADCASTSDZrm, X86::VPBROADCASTQZrm }, { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr }, { X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm }, { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr }, @@ -6895,7 +7579,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ"); table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ); // Don't change integer Q instructions to D instructions and - // use D intructions if we started with a PS instruction. + // use D instructions if we started with a PS instruction. 
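The renamed broadcast entries above live in the ReplaceableInstrs table, where each row lists the equivalent opcode in each execution domain so setExecutionDomain can move an instruction into the domain of its neighbours. A toy version of that lookup, using strings instead of the real opcode enums (purely illustrative):

#include <array>
#include <cassert>
#include <string>

// Column 0: packed-single domain, column 1: packed-double, column 2: integer.
// Same shape as ReplaceableInstrs; the two rows are taken from the diff above.
struct DomainRow { std::array<std::string, 3> Opcodes; };

static const DomainRow Rows[] = {
    {{"VBROADCASTSSZ128rr", "VBROADCASTSSZ128rr", "VPBROADCASTDZ128rr"}},
    {{"VMOVDDUPZ128rr", "VMOVDDUPZ128rr", "VPBROADCASTQZ128rr"}},
};

// Return the equivalent opcode in the requested domain, or "" if the
// instruction has no row (i.e. it cannot change domain).
std::string remapToDomain(const std::string &Opc, unsigned Domain) {
  for (const DomainRow &R : Rows)
    for (const std::string &Candidate : R.Opcodes)
      if (Candidate == Opc)
        return R.Opcodes[Domain];
  return "";
}

int main() {
  assert(remapToDomain("VBROADCASTSSZ128rr", 2) == "VPBROADCASTDZ128rr");
  assert(remapToDomain("VPBROADCASTQZ128rr", 0) == "VMOVDDUPZ128rr");
  assert(remapToDomain("ADD32rr", 1).empty());
}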
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode())) Domain = 4; } @@ -7552,7 +8236,8 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VMULSSrr: case X86::VMULSDZrr: case X86::VMULSSZrr: - return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) && + Inst.getFlag(MachineInstr::MIFlag::FmNsz); default: return false; } @@ -7679,6 +8364,10 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const { return ParamLoadedValue(*Op, Expr);; } + case X86::MOV8ri: + case X86::MOV16ri: + // TODO: Handle MOV8ri and MOV16ri. + return None; case X86::MOV32ri: case X86::MOV64ri: case X86::MOV64ri32: @@ -7738,6 +8427,20 @@ void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const { + // Propagate FP flags from the original instructions. + // But clear poison-generating flags because those may not be valid now. + // TODO: There should be a helper function for copying only fast-math-flags. + uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags(); + NewMI1.setFlags(IntersectedFlags); + NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap); + NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap); + NewMI1.clearFlag(MachineInstr::MIFlag::IsExact); + + NewMI2.setFlags(IntersectedFlags); + NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap); + NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap); + NewMI2.clearFlag(MachineInstr::MIFlag::IsExact); + // Integer instructions may define an implicit EFLAGS dest register operand. MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS); MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS); @@ -7957,8 +8660,7 @@ namespace { } // Visit the children of this block in the dominator tree. - for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end(); - I != E; ++I) { + for (auto I = Node->begin(), E = Node->end(); I != E; ++I) { Changed |= VisitNode(*I, TLSBaseAddrReg); } @@ -8073,6 +8775,35 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo( return Sum + 1; }); + // We check to see if CFI Instructions are present, and if they are + // we find the number of CFI Instructions in the candidates. + unsigned CFICount = 0; + MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); + for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); + Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { + const std::vector<MCCFIInstruction> &CFIInstructions = + RepeatedSequenceLocs[0].getMF()->getFrameInstructions(); + if (MBBI->isCFIInstruction()) { + unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex(); + MCCFIInstruction CFI = CFIInstructions[CFIIndex]; + CFICount++; + } + MBBI++; + } + + // We compare the number of found CFI Instructions to the number of CFI + // instructions in the parent function for each candidate. We must check this + // since if we outline one of the CFI instructions in a function, we have to + // outline them all for correctness. If we do not, the address offsets will be + // incorrect between the two sections of the program. + for (outliner::Candidate &C : RepeatedSequenceLocs) { + std::vector<MCCFIInstruction> CFIInstructions = + C.getMF()->getFrameInstructions(); + + if (CFICount > 0 && CFICount != CFIInstructions.size()) + return outliner::OutlinedFunction(); + } + // FIXME: Use real size in bytes for call and ret instructions. 
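The setSpecialOperandAttr change above keeps only the flags common to both original instructions and then drops the poison-generating ones, since reassociation can create intermediate values for which nsw/nuw/exact no longer hold. A self-contained sketch of that intersect-then-clear pattern; the bit values here are made up, only the flag handling matters:

#include <cassert>
#include <cstdint>

// Illustrative flag bits, not LLVM's actual MIFlag values.
enum MIFlagBits : uint16_t {
  FmReassoc = 1 << 0,
  FmNsz     = 1 << 1,
  NoSWrap   = 1 << 2,
  NoUWrap   = 1 << 3,
  IsExact   = 1 << 4,
};

uint16_t flagsForReassociatedPair(uint16_t OldFlags1, uint16_t OldFlags2) {
  // Only properties that held on *both* originals can be claimed on the
  // rewritten pair...
  uint16_t Flags = OldFlags1 & OldFlags2;
  // ...and the poison-generating ones are dropped entirely, because the new
  // intermediate results may wrap or be inexact even if the originals did not.
  Flags &= ~(NoSWrap | NoUWrap | IsExact);
  return Flags;
}

int main() {
  uint16_t A = FmReassoc | FmNsz | NoSWrap;
  uint16_t B = FmReassoc | FmNsz | IsExact;
  assert(flagsForReassociatedPair(A, B) == (FmReassoc | FmNsz));
}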
if (RepeatedSequenceLocs[0].back()->isTerminator()) { for (outliner::Candidate &C : RepeatedSequenceLocs) @@ -8084,6 +8815,9 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo( ); } + if (CFICount > 0) + return outliner::OutlinedFunction(); + for (outliner::Candidate &C : RepeatedSequenceLocs) C.setCallInfo(MachineOutlinerDefault, 1); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h index 1d2da5305357..89f2ff118c37 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h @@ -24,8 +24,6 @@ #include "X86GenInstrInfo.inc" namespace llvm { -class MachineInstrBuilder; -class X86RegisterInfo; class X86Subtarget; namespace X86 { @@ -180,8 +178,37 @@ public: /// true, then it's expected the pre-extension value is available as a subreg /// of the result register. This also returns the sub-register index in /// SubIdx. - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, + Register &DstReg, unsigned &SubIdx) const override; + + /// Returns true if the instruction has no behavior (specified or otherwise) + /// that is based on the value of any of its register operands + /// + /// Instructions are considered data invariant even if they set EFLAGS. + /// + /// A classical example of something that is inherently not data invariant is + /// an indirect jump -- the destination is loaded into icache based on the + /// bits set in the jump destination register. + /// + /// FIXME: This should become part of our instruction tables. + static bool isDataInvariant(MachineInstr &MI); + + /// Returns true if the instruction has no behavior (specified or otherwise) + /// that is based on the value loaded from memory or the value of any + /// non-address register operands. + /// + /// For example, if the latency of the instruction is dependent on the + /// particular bits set in any of the registers *or* any of the bits loaded + /// from memory. + /// + /// Instructions are considered data invariant even if they set EFLAGS. + /// + /// A classical example of something that is inherently not data invariant is + /// an indirect jump -- the destination is loaded into icache based on the + /// bits set in the jump destination register. + /// + /// FIXME: This should become part of our instruction tables. + static bool isDataInvariantLoad(MachineInstr &MI); unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -208,7 +235,7 @@ public: bool isReallyTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, unsigned SubIdx, + Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override; @@ -278,7 +305,6 @@ public: const X86InstrFMA3Group &FMA3Group) const; // Branch analysis. 
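The outliner changes above first count the CFI instructions inside a candidate, compare that count against the parent function's total, and finally bail out whenever any were found. The net rule is simple enough to state as one helper; this is a paraphrase, not LLVM code:

// A candidate that contains any CFI instruction is not outlined, because
// splitting a function's CFI sequence between the outlined body and its call
// sites would leave the unwind information with incorrect offsets.
bool mayOutlineCandidate(unsigned CFIsInCandidate) {
  return CFIsInCandidate == 0;
}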
- bool isUnpredicatedTerminator(const MachineInstr &MI) const override; bool isUnconditionalTailCall(const MachineInstr &MI) const override; bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond, const MachineInstr &TailCall) const override; @@ -291,10 +317,11 @@ public: SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; - bool getMemOperandWithOffset(const MachineInstr &LdSt, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const override; + bool getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, + SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset, + bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const override; bool analyzeBranchPredicate(MachineBasicBlock &MBB, TargetInstrInfo::MachineBranchPredicate &MBP, bool AllowModify = false) const override; @@ -306,22 +333,23 @@ public: const DebugLoc &DL, int *BytesAdded = nullptr) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, - unsigned, unsigned, int &, int &, int &) const override; + Register, Register, Register, int &, int &, + int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const DebugLoc &DL, unsigned DstReg, - ArrayRef<MachineOperand> Cond, unsigned TrueReg, - unsigned FalseReg) const override; + const DebugLoc &DL, Register DstReg, + ArrayRef<MachineOperand> Cond, Register TrueReg, + Register FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, unsigned SrcReg, + MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, unsigned DestReg, + MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -443,7 +471,7 @@ public: unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Alignment, + unsigned Size, Align Alignment, bool AllowCommute) const; bool isHighLatencyDef(int opc) const override; @@ -469,15 +497,15 @@ public: /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. Return true if the comparison instruction /// can be analyzed. - bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, + bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, + Register &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. 
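Several of the header signatures above migrate from plain unsigned to the Register wrapper. The point of the wrapper is that one integer space encodes several kinds of values, which raw unsigned parameters make easy to mix up. A simplified model of the idea is below; the bit layout shown is an assumption of this sketch, the real llvm::Register (llvm/CodeGen/Register.h) has more cases:

#include <cassert>
#include <cstdint>

// Simplified model: 0 means "no register", values with the top bit clear are
// physical registers, values with the top bit set are virtual registers.
class Reg {
  uint32_t Value = 0;
  static constexpr uint32_t VirtualFlag = 1u << 31;

public:
  constexpr Reg() = default;
  constexpr explicit Reg(uint32_t V) : Value(V) {}

  constexpr bool isValid() const { return Value != 0; }
  constexpr bool isVirtual() const { return (Value & VirtualFlag) != 0; }
  constexpr bool isPhysical() const { return isValid() && !isVirtual(); }

  static constexpr Reg virtReg(uint32_t Index) {
    return Reg(Index | VirtualFlag);
  }
};

int main() {
  Reg None;
  Reg Phys(19);              // some physical-register number (arbitrary here)
  Reg V0 = Reg::virtReg(0);
  assert(!None.isValid());
  assert(Phys.isPhysical() && !Phys.isVirtual());
  assert(V0.isVirtual() && !V0.isPhysical());
}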
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, + bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, + Register SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; /// optimizeLoadInstr - Try to remove the load by folding it to a register @@ -563,7 +591,7 @@ private: unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const; + unsigned Size, Align Alignment) const; /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td index 93f40c8ec996..23841c3d7e50 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td @@ -16,10 +16,10 @@ // X86 specific DAG Nodes. // -def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; - -def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisSameAs<1, 2>]>; +def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>, + SDTCisSameAs<1, 2>]>; def SDTX86Cmov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, @@ -121,6 +121,8 @@ def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -138,12 +140,13 @@ def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; -def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; -def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; +def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>; +def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>; def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; -def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86CmpTest, [SDNPHasChain]>; -def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86CmpTest, [SDNPHasChain]>; +def X86fcmp : SDNode<"X86ISD::FCMP", SDTX86FCmp>; +def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86FCmp, [SDNPHasChain]>; +def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86FCmp, [SDNPHasChain]>; def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; @@ -152,8 +155,6 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; -def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; - def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, [SDNPHasChain, SDNPSideEffect]>; @@ -286,6 +287,9 @@ def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>; +def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>; +def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>; + def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; def 
X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, @@ -294,6 +298,9 @@ def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, [SDNPHasChain]>; +def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", SDT_X86PROBED_ALLOCA, + [SDNPHasChain]>; + def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; @@ -354,6 +361,8 @@ let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in { def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; } def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; } def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; } + + def X86SibMemOperand : AsmOperandClass { let Name = "SibMem"; } } def X86AbsMemAsmOperand : AsmOperandClass { @@ -376,14 +385,16 @@ class X86VMemOperand<RegisterClass RC, string printMethod, let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG); } -def anymem : X86MemOperand<"printanymem">; +def anymem : X86MemOperand<"printMemReference">; def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs), [(X86strict_fcmp node:$lhs, node:$rhs), - (X86cmp node:$lhs, node:$rhs)]>; + (X86fcmp node:$lhs, node:$rhs)]>; // FIXME: Right now we allow any size during parsing, but we might want to // restrict to only unsized memory. -def opaquemem : X86MemOperand<"printopaquemem">; +def opaquemem : X86MemOperand<"printMemReference">; + +def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>; def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>; def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; @@ -757,14 +768,14 @@ def i64u8imm : Operand<i64> { } def lea64_32mem : Operand<i32> { - let PrintMethod = "printanymem"; + let PrintMethod = "printMemReference"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); let ParserMatchClass = X86MemAsmOperand; } // Memory operands that use 64-bit pointers in both ILP32 and LP64. def lea64mem : Operand<i64> { - let PrintMethod = "printanymem"; + let PrintMethod = "printMemReference"; let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); let ParserMatchClass = X86MemAsmOperand; } @@ -830,11 +841,10 @@ def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; -// A relocatable immediate is either an immediate operand or an operand that can -// be relocated by the linker to an immediate, such as a regular symbol in -// non-PIC code. -def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [], - 0>; +// A relocatable immediate is an operand that can be relocated by the linker to +// an immediate, such as a regular symbol in non-PIC code. +def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", + [X86Wrapper], [], 0>; //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. 
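The new X86pdep/X86pext selection-DAG nodes above model the BMI2 PDEP and PEXT instructions, which the PDEP32/PEXT32 instruction definitions later in this diff now select through these nodes instead of through target intrinsics. For reference, the bit deposit/extract operations they compute, as a small software version (standard algorithm, not LLVM code):

#include <cassert>
#include <cstdint>

// PDEP: take the low bits of Src, in order, and scatter them to the bit
// positions that are set in Mask; all other result bits are zero.
uint32_t pdep32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t SrcBit = 1; Mask != 0; SrcBit <<= 1) {
    if (Src & SrcBit)
      Result |= Mask & -Mask; // lowest remaining set bit of Mask
    Mask &= Mask - 1;         // clear that bit
  }
  return Result;
}

// PEXT: gather the bits of Src at the positions set in Mask and pack them
// contiguously into the low bits of the result.
uint32_t pext32(uint32_t Src, uint32_t Mask) {
  uint32_t Result = 0;
  for (uint32_t DstBit = 1; Mask != 0; DstBit <<= 1) {
    if (Src & Mask & -Mask)
      Result |= DstBit;
    Mask &= Mask - 1;
  }
  return Result;
}

int main() {
  assert(pdep32(0b101, 0b11010) == 0b10010);
  assert(pext32(0b10010, 0b11010) == 0b101);
  assert(pext32(pdep32(0x3, 0xF0), 0xF0) == 0x3);
}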
@@ -922,11 +932,10 @@ def HasRTM : Predicate<"Subtarget->hasRTM()">; def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasSGX : Predicate<"Subtarget->hasSGX()">; -def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; -def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">; def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">; def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; @@ -948,18 +957,23 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">; +def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">; +def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">; +def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; +def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">; +def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, - AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; + AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">; def In64BitMode : Predicate<"Subtarget->is64Bit()">, - AssemblerPredicate<"Mode64Bit", "64-bit mode">; + AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">; def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; def In16BitMode : Predicate<"Subtarget->is16Bit()">, - AssemblerPredicate<"Mode16Bit", "16-bit mode">; + AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">; def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, - AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">; + AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">; def In32BitMode : Predicate<"Subtarget->is32Bit()">, - AssemblerPredicate<"Mode32Bit", "32-bit mode">; + AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" @@ -1033,13 +1047,17 @@ def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; -// FIXME: Ideally we would just replace the above i*immSExt* matchers with -// relocImm-based matchers, but then FastISel would be unable to use them. +def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{ + return isSExtAbsoluteSymbolRef(8, N); +}]>; +def i32relocImmSExt8 : PatLeaf<(i32 relocImm), [{ + return isSExtAbsoluteSymbolRef(8, N); +}]>; def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{ - return isSExtRelocImm<8>(N); + return isSExtAbsoluteSymbolRef(8, N); }]>; def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ - return isSExtRelocImm<32>(N); + return isSExtAbsoluteSymbolRef(32, N); }]>; // If we have multiple users of an immediate, it's much smaller to reuse @@ -1059,6 +1077,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ // Eventually, it would be nice to allow ConstantHoisting to merge constants // globally for potentially added savings. 
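The new i16/i32/i64relocImmSExt8 leaves above exist because many x86 ALU instructions have a shorter encoding whose 8-bit immediate is sign-extended to the operand width, so a linker-resolved constant may only use that form if its value is guaranteed to land in [-128, 127]. The fit test itself is the usual isInt<8> check; a standalone equivalent written without LLVM headers:

#include <cassert>
#include <cstdint>

// Same spirit as LLVM's isInt<N>(): does the value survive truncation to N
// bits followed by sign extension back to 64 bits?
template <unsigned N> bool fitsSignedBits(int64_t V) {
  static_assert(N > 0 && N < 64, "bit width out of range");
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}

int main() {
  assert(fitsSignedBits<8>(127) && fitsSignedBits<8>(-128));
  assert(!fitsSignedBits<8>(128) && !fitsSignedBits<8>(-129));
  // e.g. "add rax, -1" can use the sign-extended imm8 form, while
  // "add rax, 300" needs the full imm32 encoding.
  assert(fitsSignedBits<8>(-1) && !fitsSignedBits<8>(300));
}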
// +def imm_su : PatLeaf<(imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + def relocImm8_su : PatLeaf<(i8 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; @@ -1069,20 +1094,26 @@ def relocImm32_su : PatLeaf<(i32 relocImm), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ +def i16relocImmSExt8_su : PatLeaf<(i16relocImmSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ +def i32relocImmSExt8_su : PatLeaf<(i32relocImmSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ +def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{ +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; -def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{ +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ return !shouldAvoidImmediateInstFormsForSize(N); }]>; @@ -1113,7 +1144,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) return true; - if (ExtType == ISD::EXTLOAD) + if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad) return LD->getAlignment() >= 2 && LD->isSimple(); return false; }]>; @@ -1123,7 +1154,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ ISD::LoadExtType ExtType = LD->getExtensionType(); if (ExtType == ISD::NON_EXTLOAD) return true; - if (ExtType == ISD::EXTLOAD) + if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad) return LD->getAlignment() >= 4 && LD->isSimple(); return false; }]>; @@ -1550,7 +1581,7 @@ def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), [(set GR16:$dst, imm:$src)]>, OpSize16; def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, relocImm:$src)]>, OpSize32; + [(set GR32:$dst, imm:$src)]>, OpSize32; def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, i64immSExt32:$src)]>; @@ -1558,7 +1589,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), let isReMaterializable = 1, isMoveImm = 1 in { def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), "movabs{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, relocImm:$src)]>; + [(set GR64:$dst, imm:$src)]>; } // Longer forms that use a ModR/M byte. 
Needed for disassembler @@ -1578,19 +1609,31 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 relocImm8_su:$src), addr:$dst)]>; + [(store (i8 imm_su:$src), addr:$dst)]>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16; + [(store (i16 imm_su:$src), addr:$dst)]>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), "mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32; + [(store (i32 imm_su:$src), addr:$dst)]>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64relocImmSExt32_su:$src, addr:$dst)]>, + [(store i64immSExt32_su:$src, addr:$dst)]>, Requires<[In64BitMode]>; } // SchedRW +def : Pat<(i32 relocImm:$src), (MOV32ri relocImm:$src)>; +def : Pat<(i64 relocImm:$src), (MOV64ri relocImm:$src)>; + +def : Pat<(store (i8 relocImm8_su:$src), addr:$dst), + (MOV8mi addr:$dst, relocImm8_su:$src)>; +def : Pat<(store (i16 relocImm16_su:$src), addr:$dst), + (MOV16mi addr:$dst, relocImm16_su:$src)>; +def : Pat<(store (i32 relocImm32_su:$src), addr:$dst), + (MOV32mi addr:$dst, relocImm32_su:$src)>; +def : Pat<(store (i64 i64relocImmSExt32_su:$src), addr:$dst), + (MOV64mi32 addr:$dst, i64immSExt32_su:$src)>; + let hasSideEffects = 0 in { /// Memory offset versions of moves. The immediate is an address mode sized @@ -1787,9 +1830,8 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, // Condition code ops, incl. set if equal/not equal/... 
let SchedRW = [WriteLAHFSAHF] in { -let Defs = [EFLAGS], Uses = [AH] in -def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))]>, +let Defs = [EFLAGS], Uses = [AH], hasSideEffects = 0 in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>, // flags = AH Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags @@ -2163,24 +2205,24 @@ def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), // Lock instruction prefix let SchedRW = [WriteMicrocoded] in -def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; +def LOCK_PREFIX : I<0xF0, PrefixByte, (outs), (ins), "lock", []>; let SchedRW = [WriteNop] in { // Rex64 instruction prefix -def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, +def REX64_PREFIX : I<0x48, PrefixByte, (outs), (ins), "rex64", []>, Requires<[In64BitMode]>; // Data16 instruction prefix -def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; +def DATA16_PREFIX : I<0x66, PrefixByte, (outs), (ins), "data16", []>; } // SchedRW // Repeat string operation instruction prefixes let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in { // Repeat (used with INS, OUTS, MOVS, LODS and STOS) -def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +def REP_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "rep", []>; // Repeat while not equal (used with CMPS and SCAS) -def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +def REPNE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "repne", []>; } // String manipulation instructions @@ -2581,27 +2623,27 @@ let Predicates = [HasBMI2, NoTBM] in { } multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int, + X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag> { def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>, VEX_4V, Sched<[WriteALU]>; def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, + [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; } let Predicates = [HasBMI2] in { defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, - int_x86_bmi_pdep_32, loadi32>, T8XD; + X86pdep, loadi32>, T8XD; defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, - int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + X86pdep, loadi64>, T8XD, VEX_W; defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, - int_x86_bmi_pext_32, loadi32>, T8XS; + X86pext, loadi32>, T8XS; defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, - int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; + X86pext, loadi64>, T8XS, VEX_W; } //===----------------------------------------------------------------------===// @@ -2785,11 +2827,11 @@ let SchedRW = [WriteStore] in { def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore32 addr:$dst, GR32:$src)]>, - T8, Requires<[HasMOVDIRI]>; + T8PS, Requires<[HasMOVDIRI]>; def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore64 addr:$dst, GR64:$src)]>, - T8, Requires<[In64BitMode, HasMOVDIRI]>; + T8PS, 
Requires<[In64BitMode, HasMOVDIRI]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -2856,6 +2898,23 @@ def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>; def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// +// SERIALIZE Instruction +// +def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize", + [(int_x86_serialize)]>, PS, + Requires<[HasSERIALIZE]>; + +//===----------------------------------------------------------------------===// +// TSXLDTRK - TSX Suspend Load Address Tracking +// +let Predicates = [HasTSXLDTRK] in { + def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk", + [(int_x86_xsusldtrk)]>, XD; + def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk", + [(int_x86_xresldtrk)]>, XD; +} + +//===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. //===----------------------------------------------------------------------===// @@ -2913,6 +2972,11 @@ let Predicates = [HasTBM] in { (TZMSK64rr GR64:$src)>; // Patterns to match flag producing ops. + def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, 1)), + (BLCFILL32rr GR32:$src)>; + def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, 1)), + (BLCFILL64rr GR64:$src)>; + def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))), (BLCI32rr GR32:$src)>; def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))), @@ -2974,7 +3038,7 @@ def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", - [(int_x86_cldemote addr:$src)]>, TB; + [(int_x86_cldemote addr:$src)]>, PS; //===----------------------------------------------------------------------===// // Subsystems. @@ -3013,6 +3077,9 @@ include "X86InstrSVM.td" include "X86InstrTSX.td" include "X86InstrSGX.td" +// AMX instructions +include "X86InstrAMX.td" + // System instructions. include "X86InstrSystem.td" @@ -3108,6 +3175,9 @@ def : MnemonicAlias<"smovl", "movsl", "att">; def : MnemonicAlias<"smovq", "movsq", "att">; def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"ud2bw", "ud1w", "att">; +def : MnemonicAlias<"ud2bl", "ud1l", "att">; +def : MnemonicAlias<"ud2bq", "ud1q", "att">; def : MnemonicAlias<"verrw", "verr", "att">; // MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release' diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td index 0f4d4d764cc9..49940204c25a 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td @@ -24,8 +24,9 @@ // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. 
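The new TBM patterns above let the compiler select BLCFILL for the expression x & (x + 1) when the flag result is also wanted (the _nocf pattern fragments require, roughly, that the carry flag produced by the AND is not consumed). The underlying identity, for reference:

#include <cassert>
#include <cstdint>

// BLCFILL ("fill from lowest clear bit") computes x & (x + 1): adding one
// carries through the trailing run of 1 bits, so the AND clears exactly that
// trailing run and keeps everything above it.
uint32_t blcfill(uint32_t X) { return X & (X + 1); }

int main() {
  assert(blcfill(0b10111) == 0b10000); // trailing 111 cleared
  assert(blcfill(0b11110) == 0b11110); // no trailing ones -> unchanged
  assert(blcfill(0xFFFFFFFFu) == 0);   // all ones -> everything cleared
}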
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isPseudo = 1, SchedRW = [WriteZero] in { -def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>; + isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasMMX] in { +def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", + [(set VR64:$dst, (x86mmx (MMX_X86movw2d (i32 0))))]>; } let Constraints = "$src1 = $dst" in { @@ -43,8 +44,7 @@ let Constraints = "$src1 = $dst" in { def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, OType:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>, + [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -60,8 +60,7 @@ let Constraints = "$src1 = $dst" in { def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>, + [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), (ins VR64:$src1, i32u8imm:$src2), @@ -81,8 +80,7 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR64:$dst, - (IntId64 (bitconvert (load_mmx addr:$src))))]>, + [(set VR64:$dst, (IntId64 (load_mmx addr:$src)))]>, Sched<[sched.Folded]>; } @@ -101,8 +99,7 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, (ins VR64:$src1, i64mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, - (IntId64 VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>, + (IntId64 VR64:$src1, (load_mmx addr:$src2)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -118,8 +115,8 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId, def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2, u8imm:$src3), !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR64:$dst, (IntId VR64:$src1, - (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>, + [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2), + (i8 timm:$src3)))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -164,23 +161,14 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>; def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (scalar_to_vector GR32:$src)))]>, + (x86mmx (MMX_X86movw2d GR32:$src)))]>, Sched<[WriteVecMoveFromGpr]>; def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>, + (x86mmx (MMX_X86movw2d (loadi32 addr:$src))))]>, Sched<[WriteVecLoad]>; -let Predicates = [HasMMX] in { - def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), - (MMX_MOVD64rr GR32:$src)>; - def : Pat<(x86mmx (MMX_X86movw2d (i32 0))), - (MMX_SET0)>; - def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), - (MMX_MOVD64rm addr:$src)>; -} - let mayStore = 1 in def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), "movd\t{$src, $dst|$dst, $src}", []>, @@ -240,20 +228,21 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movq\t{$src, $dst|$dst, $src}", [(store (x86mmx 
VR64:$src), addr:$dst)]>; +def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; +def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1, + [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>; + let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, - (x86mmx (bitconvert - (i64 (extractelt (v2i64 VR128:$src), - (iPTR 0))))))]>; + (x86mmx (MMX_X86movdq2q VR128:$src)))]>; def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2i64 - (scalar_to_vector - (i64 (bitconvert (x86mmx VR64:$src))))))]>; + (v2i64 (MMX_X86movq2dq VR64:$src)))]>; let isCodeGenOnly = 1, hasSideEffects = 1 in { def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), @@ -272,14 +261,6 @@ def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>, Sched<[SchedWriteVecMoveLSNT.MMX.MR]>; -let Predicates = [HasMMX] in { - // movd to MMX register zero-extends - def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))), - (MMX_MOVD64rr GR32:$src)>; - def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))), - (MMX_MOVD64rm addr:$src)>; -} - // Arithmetic Instructions defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, SchedWriteVecALU.MMX>; @@ -566,27 +547,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (int_x86_mmx_pmovmskb VR64:$src))]>, Sched<[WriteMMXMOVMSK]>; -// MMX to XMM for vector types -def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1, - [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>; - -def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)), - (v2i64 (MMX_MOVQ2DQrr VR64:$src))>; - -// Low word of XMM to MMX. -def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, - [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; - -def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), - (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; - -def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))), - (x86mmx (MMX_MOVQ64rm addr:$src))>; - -def : Pat<(v2i64 (X86vzmovl (scalar_to_vector - (i64 (bitconvert (x86mmx VR64:$src)))))), - (MMX_MOVQ2DQrr VR64:$src)>; - // Misc. 
let SchedRW = [SchedWriteShuffle.MMX] in { let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td index 747f5aa86653..6439f717accb 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td @@ -17,13 +17,13 @@ let SchedRW = [WriteSystem], Predicates = [HasSGX] in { // ENCLS - Execute an Enclave System Function of Specified Leaf Number def ENCLS : I<0x01, MRM_CF, (outs), (ins), - "encls", []>, TB; + "encls", []>, PS; // ENCLU - Execute an Enclave User Function of Specified Leaf Number def ENCLU : I<0x01, MRM_D7, (outs), (ins), - "enclu", []>, TB; + "enclu", []>, PS; // ENCLV - Execute an Enclave VMM Function of Specified Leaf Number def ENCLV : I<0x01, MRM_C0, (outs), (ins), - "enclv", []>, TB; + "enclv", []>, PS; } // SchedRW diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td index c45f342ed75b..c3c9f22381f8 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td @@ -43,7 +43,7 @@ let isCodeGenOnly = 1 in { multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, RegisterClass RC, ValueType VT, string asm, Operand memopr, - ComplexPattern mem_cpat, Domain d, + PatFrags mem_frags, Domain d, X86FoldableSchedWrite sched, bit Is2Addr = 1> { let hasSideEffects = 0 in { def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), @@ -57,7 +57,7 @@ let hasSideEffects = 0 in { !if(Is2Addr, !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>, + [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -720,11 +720,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), } // SchedRW let Predicates = [UseAVX] in { - // Also handle an i64 load because that may get selected as a faster way to - // load the data. - def : Pat<(v2f64 (X86Unpckl VR128:$src1, - (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), - (VMOVHPDrm VR128:$src1, addr:$src2)>; + // MOVHPD patterns def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), (VMOVHPDrm VR128:$src1, addr:$src2)>; @@ -754,12 +750,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // MOVHPD patterns - - // Also handle an i64 load because that may get selected as a faster way to - // load the data. 
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, - (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), - (MOVHPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), (MOVHPDrm VR128:$src1, addr:$src2)>; @@ -884,6 +874,23 @@ defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf6 "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG; + +defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32, + "cvtss2si", "cvtss2si", + WriteCvtSS2I, SSEPackedSingle>, + XS, VEX, VEX_LIG; +defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32, + "cvtss2si", "cvtss2si", + WriteCvtSS2I, SSEPackedSingle>, + XS, VEX, VEX_W, VEX_LIG; +defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64, + "cvtsd2si", "cvtsd2si", + WriteCvtSD2I, SSEPackedDouble>, + XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, + "cvtsd2si", "cvtsd2si", + WriteCvtSD2I, SSEPackedDouble>, + XD, VEX, VEX_W, VEX_LIG; } // The assembler can recognize rr 64-bit instructions by seeing a rxx @@ -923,6 +930,12 @@ let Predicates = [UseAVX] in { (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; def : Pat<(f64 (any_sint_to_fp GR64:$src)), (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>; + + def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>; + def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>; + + def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>; + def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>; } let isCodeGenOnly = 1 in { @@ -938,6 +951,20 @@ defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; + +defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32, + "cvtss2si", "cvtss2si", + WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC; +defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32, + "cvtss2si", "cvtss2si", + WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC; +defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64, + "cvtsd2si", "cvtsd2si", + WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC; +defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, + "cvtsd2si", "cvtsd2si", + WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; + defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32, "cvtsi2ss", "cvtsi2ss{l}", WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC; @@ -952,12 +979,22 @@ defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64, WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC; } // isCodeGenOnly = 1 +let Predicates = [UseSSE1] in { + def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>; + def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>; + def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>; +} + // Conversion Instructions Intrinsics - Match intrinsics which expect MM // and/or XMM operand(s). 
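The new lrint/llrint patterns above let the libm-style rounding conversions select CVTSS2SI/CVTSD2SI, which round according to the current MXCSR rounding mode. A minimal sketch of source code that is expected to hit these patterns (assuming SSE/SSE2 codegen; not taken from this patch):

#include <cmath>
#include <cstdint>

// Expected to lower to cvtss2si / cvtsd2si (64-bit forms) rather than a libm
// call, rounding with the current MXCSR rounding mode.
int64_t to_i64_from_float(float f)   { return std::llrint(f); }
int64_t to_i64_from_double(double d) { return std::llrint(d); }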
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, ValueType DstVT, ValueType SrcVT, SDNode OpNode, - Operand memop, ComplexPattern mem_cpat, string asm, + Operand memop, PatFrags mem_frags, string asm, X86FoldableSchedWrite sched, Domain d> { let ExeDomain = d in { def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), @@ -966,7 +1003,7 @@ let ExeDomain = d in { Sched<[sched]>; def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>, + [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>, Sched<[sched.Folded]>; } } @@ -1247,7 +1284,7 @@ def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, - (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>, + (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { @@ -1261,7 +1298,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, - (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>, + (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } @@ -1745,124 +1782,94 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - SDNode OpNode, ValueType VT, + Operand memop, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, - X86FoldableSchedWrite sched> { -let Uses = [MXCSR], mayRaiseFPException = 1 in { - let isCommutable = 1 in - def rr : SIi8<0xC2, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>, - Sched<[sched]>; - def rm : SIi8<0xC2, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), timm:$cc))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; -} -} - -let isCodeGenOnly = 1 in { - let ExeDomain = SSEPackedSingle in - defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, - "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; - let ExeDomain = SSEPackedDouble in - defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, - "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PD.Scl>, - XD, VEX_4V, VEX_LIG, VEX_WIG; - - let Constraints = "$src1 = $dst" in { - let ExeDomain = SSEPackedSingle in - defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, - "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PS.Scl>, XS; - let ExeDomain = SSEPackedDouble in - defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, - "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PD.Scl>, XD; - } -} - -multiclass sse12_cmp_scalar_int<Operand memop, - Intrinsic Int, string asm, X86FoldableSchedWrite sched, - ComplexPattern mem_cpat> { -let Uses = [MXCSR], mayRaiseFPException = 1 in { + X86FoldableSchedWrite sched, + PatFrags mem_frags> { 
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src, u8imm:$cc), asm, - [(set VR128:$dst, (Int VR128:$src1, - VR128:$src, timm:$cc))]>, - Sched<[sched]>; -let mayLoad = 1 in + (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm, + [(set VR128:$dst, (OpNode (VT VR128:$src1), + VR128:$src2, timm:$cc))]>, + Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, memop:$src, u8imm:$cc), asm, - [(set VR128:$dst, (Int VR128:$src1, - mem_cpat:$src, timm:$cc))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; -} + (ins VR128:$src1, memop:$src2, u8imm:$cc), asm, + [(set VR128:$dst, (OpNode (VT VR128:$src1), + (mem_frags addr:$src2), timm:$cc))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + + let isCodeGenOnly = 1 in { + let isCommutable = 1 in + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>, + Sched<[sched]>, SIMD_EXC; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, + [(set RC:$dst, (OpNode RC:$src1, + (ld_frag addr:$src2), timm:$cc))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + } } -// Aliases to match intrinsics which expect XMM operand(s). let ExeDomain = SSEPackedSingle in -defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, - "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", - SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, - XS, VEX_4V, VEX_LIG, VEX_WIG; +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, + XS, VEX_4V, VEX_LIG, VEX_WIG; let ExeDomain = SSEPackedDouble in -defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, - "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", - SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, - XD, VEX_4V, VEX_LIG, VEX_WIG; +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, + XD, VEX_4V, VEX_LIG, VEX_WIG; + let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in - defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, - "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}", - SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; let ExeDomain = SSEPackedDouble in - defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, - "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}", - SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; } - // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, PatFrag ld_frag, string OpcodeStr, Domain d, - X86FoldableSchedWrite sched = WriteFCom> { -let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, - ExeDomain = d in { + X86FoldableSchedWrite sched = WriteFComX> { + let ExeDomain = d in { def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, 
$src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, - Sched<[sched]>; -let mayLoad = 1 in + Sched<[sched]>, SIMD_EXC; + let mayLoad = 1 in def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), (ld_frag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, Operand memop, - ComplexPattern mem_cpat, string OpcodeStr, + PatFrags mem_frags, string OpcodeStr, Domain d, - X86FoldableSchedWrite sched = WriteFCom> { -let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in { + X86FoldableSchedWrite sched = WriteFComX> { +let ExeDomain = d in { def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), - mem_cpat:$src2))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + (mem_frags addr:$src2)))]>, + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -1914,18 +1921,16 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, ValueType VT, string asm, X86FoldableSchedWrite sched, Domain d, PatFrag ld_frag> { -let Uses = [MXCSR], mayRaiseFPException = 1 in { let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, [(set RC:$dst, (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, - Sched<[sched.Folded, sched.ReadAfterFold]>; -} + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, @@ -2812,7 +2817,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, } multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, - ComplexPattern int_cpat, Intrinsic Intr, + PatFrags mem_frags, Intrinsic Intr, Predicate target, string Suffix> { let Predicates = [target] in { // These are unary operations, but they are modeled as having 2 source operands @@ -2828,13 +2833,13 @@ multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, // which has a clobber before the rcp, vs. 
// rcpss mem, %xmm0 let Predicates = [target, OptForSize] in { - def : Pat<(Intr int_cpat:$src2), + def : Pat<(Intr (mem_frags addr:$src2)), (!cast<Instruction>(NAME#m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; } } -multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, +multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags, Intrinsic Intr, Predicate target> { let Predicates = [target] in { def : Pat<(Intr VR128:$src), @@ -2842,7 +2847,7 @@ multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int VR128:$src)>; } let Predicates = [target, OptForSize] in { - def : Pat<(Intr int_cpat:$src2), + def : Pat<(Intr (mem_frags addr:$src2)), (!cast<Instruction>(NAME#m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; } @@ -2968,28 +2973,28 @@ let Predicates = [HasAVX, NoVLX] in { multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched, Predicate AVXTarget> { defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, - !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), + !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), UseSSE1, "SS">, XS; defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, - !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), + !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), AVXTarget>, XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; } multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched, Predicate AVXTarget> { - defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, + defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; - defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, + defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, XS, VEX_4V, VEX_LIG, VEX_WIG; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched, Predicate AVXTarget> { - defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, + defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; - defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, + defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, XD, VEX_4V, VEX_LIG, VEX_WIG; } @@ -3185,13 +3190,13 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins), let SchedRW = [WriteFence] in { // Load, store, and memory fence -// TODO: As with mfence, we may want to ease the availablity of sfence/lfence +// TODO: As with mfence, we may want to ease the availability of sfence/lfence // to include any 64-bit target. 
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, +def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, PS, Requires<[HasSSE1]>; -def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, +def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, PS, Requires<[HasSSE2]>; -def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, +def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, PS, Requires<[HasMFence]>; } // SchedRW @@ -3213,11 +3218,11 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let mayLoad=1, hasSideEffects=1 in def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, - TB, Sched<[WriteLDMXCSR]>; + PS, Sched<[WriteLDMXCSR]>; let mayStore=1, hasSideEffects=1 in def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, - TB, Sched<[WriteSTMXCSR]>; + PS, Sched<[WriteSTMXCSR]>; //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -4185,8 +4190,6 @@ let Predicates = [UseAVX] in { // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. - def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), - (VMOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzload32 addr:$src)), (VMOVDI2PDIrm addr:$src)>; def : Pat<(v8i32 (X86vzload32 addr:$src)), @@ -4199,8 +4202,6 @@ let Predicates = [UseSSE2] in { def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), (MOV64toPQIrr GR64:$src)>; - def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), - (MOVDI2PDIrm addr:$src)>; def : Pat<(v4i32 (X86vzload32 addr:$src)), (MOVDI2PDIrm addr:$src)>; } @@ -4429,16 +4430,11 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), - (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; } let Predicates = [UseSSE3] in { - // No need for aligned memory as this only loads 64-bits. 
- def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), - (MOVDDUPrm addr:$src)>; def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), (MOVDDUPrm addr:$src)>; } @@ -5022,7 +5018,9 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; - def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))), + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), @@ -5030,12 +5028,14 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))), (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; - def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))), + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; } } @@ -5499,7 +5499,7 @@ let ExeDomain = SSEPackedSingle in { !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>, + (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 @@ -5522,7 +5522,7 @@ let ExeDomain = SSEPackedDouble in { !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>, + (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>, Sched<[sched.Folded, sched.ReadAfterFold]>; } // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 } @@ -6623,7 +6623,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, - T8, Sched<[sched]>; + T8PS, Sched<[sched]>; def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), @@ -6634,7 +6634,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, (set VR128:$dst, (IntId VR128:$src1, (memop addr:$src2), XMM0)), (set VR128:$dst, (IntId VR128:$src1, - (memop addr:$src2))))]>, T8, + (memop addr:$src2))))]>, T8PS, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6644,7 +6644,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, - (i8 timm:$src3)))]>, TA, + (i8 timm:$src3)))]>, TAPS, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), @@ -6652,7 +6652,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), - (i8 
timm:$src3)))]>, TA, + (i8 timm:$src3)))]>, TAPS, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -6687,7 +6687,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, PatFrag ld_frag, bit Is2Addr = 0, RegisterClass RC = VR128, X86MemOperand MemOp = i128mem> { - let AsmString = OpcodeStr## + let AsmString = OpcodeStr# !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), @@ -6874,10 +6874,10 @@ defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, X86MemOperand MemOp, string Hi, string Lo> { - def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; - def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; } @@ -7290,13 +7290,12 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, X86FoldableSchedWrite sched> { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, + [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>, T8PD, VEX, Sched<[sched]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>, - T8PD, VEX, Sched<[sched.Folded]>; + []>, T8PD, VEX, Sched<[sched.Folded]>; } multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, @@ -7304,7 +7303,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>, + [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>, TAPD, VEX, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), @@ -7322,44 +7321,26 @@ let Predicates = [HasF16C, NoVLX] in { WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; // Pattern match vcvtph2ps of a scalar i64 load. 
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), + def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 + def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (VCVTPH2PSrm addr:$src)>; + def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))), + (VCVTPH2PSYrm addr:$src)>; def : Pat<(store (f64 (extractelt - (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), + (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; def : Pat<(store (i64 (extractelt - (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), + (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; - def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), + def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; } -// Patterns for matching conversions from float to half-float and vice versa. -let Predicates = [HasF16C, NoVLX] in { - // Use MXCSR.RC for rounding instead of explicitly specifying the default - // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the - // configurations we support (the default). However, falling back to MXCSR is - // more consistent with other instructions, which are always controlled by it. - // It's encoded as 0b100. - def : Pat<(fp_to_f16 FR32:$src), - (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr - (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>; - - def : Pat<(f16_to_fp GR16:$src), - (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr - (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >; - - def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), - (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr - (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >; -} - //===----------------------------------------------------------------------===// // AVX2 Instructions //===----------------------------------------------------------------------===// @@ -7415,7 +7396,7 @@ def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. -// NOTE: We're using FP instructions here, but exeuction domain fixing should +// NOTE: We're using FP instructions here, but execution domain fixing should // take care of using integer instructions when profitable. let Predicates = [HasAVX] in { def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), @@ -7496,46 +7477,6 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastl v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2, NoVLX] in { - // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. - def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), - (VPBROADCASTQYrm addr:$src)>; - - // FIXME this is to handle aligned extloads from i8/i16. 
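The F16C patterns above now route through the strict-FP-aware X86any_cvtph2ps/X86any_cvtps2ph nodes, and the ad-hoc fp_to_f16/f16_to_fp expansions are dropped. For orientation, the same instructions are reachable from user code via the F16C intrinsics; a minimal sketch (not part of this patch), using _MM_FROUND_CUR_DIRECTION so the conversion rounds per MXCSR, as the removed comment described:

#include <immintrin.h>  // requires F16C

__m128 widen_halfs(__m128i four_halfs) {
  return _mm_cvtph_ps(four_halfs);                   // vcvtph2ps
}

__m128i narrow_floats(__m128 v) {
  return _mm_cvtps_ph(v, _MM_FROUND_CUR_DIRECTION);  // vcvtps2ph, rounds per MXCSR
}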
- def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; -} -let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. - // This means we'll encounter truncated i32 loads; match that here. - def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast - (i16 (trunc (i32 (extloadi16 addr:$src)))))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast - (i16 (trunc (i32 (zextloadi16 addr:$src)))))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast - (i16 (trunc (i32 (extloadi16 addr:$src)))))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast - (i16 (trunc (i32 (zextloadi16 addr:$src)))))), - (VPBROADCASTWYrm addr:$src)>; - - // FIXME this is to handle aligned extloads from i8. - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; -} - -let Predicates = [HasAVX2, NoVLX] in { // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v4f32 (X86VBroadcast FR32:$src)), @@ -7597,10 +7538,6 @@ let Predicates = [HasAVX, NoVLX] in { def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), (VMOVDDUPrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), - (VMOVDDUPrm addr:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), - (VMOVDDUPrm addr:$src)>; } let Predicates = [HasAVX1Only] in { @@ -7760,39 +7697,43 @@ let Predicates = [HasAVX2, NoVLX] in { // multiclass avx2_pmovmask<string OpcodeStr, Intrinsic IntLd128, Intrinsic IntLd256, - Intrinsic IntSt128, Intrinsic IntSt256> { + Intrinsic IntSt128, Intrinsic IntSt256, + X86SchedWriteMaskMove schedX, + X86SchedWriteMaskMove schedY> { def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, - VEX_4V, Sched<[WriteVecMaskedLoad]>; + VEX_4V, Sched<[schedX.RM]>; def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>; + VEX_4V, VEX_L, Sched<[schedY.RM]>; def mr : AVX28I<0x8e, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, - VEX_4V, Sched<[WriteVecMaskedStore]>; + VEX_4V, Sched<[schedX.MR]>; def Ymr : AVX28I<0x8e, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, - VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>; + VEX_4V, VEX_L, Sched<[schedY.MR]>; } defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", int_x86_avx2_maskload_d, int_x86_avx2_maskload_d_256, int_x86_avx2_maskstore_d, - int_x86_avx2_maskstore_d_256>; + int_x86_avx2_maskstore_d_256, + WriteVecMaskMove32, 
WriteVecMaskMove32Y>; defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskload_q, int_x86_avx2_maskload_q_256, int_x86_avx2_maskstore_q, - int_x86_avx2_maskstore_q_256>, VEX_W; + int_x86_avx2_maskstore_q_256, + WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W; multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, ValueType MaskVT> { @@ -7905,57 +7846,48 @@ let Predicates = [HasAVX2, NoVLX] in { // FIXME: Improve scheduling of gather instructions. multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx, - ValueType VTy, PatFrag GatherNode128, - PatFrag GatherNode256, RegisterClass RC256, + ValueType VTy, RegisterClass RC256, X86MemOperand memop128, X86MemOperand memop256, ValueType MTx = VTx, ValueType MTy = VTy> { +let mayLoad = 1, hasSideEffects = 0 in { def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb), (ins VR128:$src1, memop128:$src2, VR128:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - [(set (VTx VR128:$dst), (MTx VR128:$mask_wb), - (GatherNode128 VR128:$src1, VR128:$mask, - vectoraddr:$src2))]>, - VEX, Sched<[WriteLoad]>; + []>, VEX, Sched<[WriteLoad]>; def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb), (ins RC256:$src1, memop256:$src2, RC256:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - [(set (VTy RC256:$dst), (MTy RC256:$mask_wb), - (GatherNode256 RC256:$src1, RC256:$mask, - vectoraddr:$src2))]>, - VEX, VEX_L, Sched<[WriteLoad]>; + []>, VEX, VEX_L, Sched<[WriteLoad]>; +} } let Predicates = [HasAVX2] in { let mayLoad = 1, hasSideEffects = 0, Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in { - defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32, - mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W; - defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64, - mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W; - defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32, - mgatherv8i32, VR256, vx128mem, vy256mem>; - defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64, - mgatherv4i64, VR128, vx64mem, vy128mem>; + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, + VR256, vx128mem, vx256mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, + VR256, vx128mem, vy256mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, + VR256, vx128mem, vy256mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, + VR128, vx64mem, vy128mem>; let ExeDomain = SSEPackedDouble in { - defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32, - mgatherv4i32, VR256, vx128mem, vx256mem, - v2i64, v4i64>, VEX_W; - defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64, - mgatherv4i64, VR256, vx128mem, vy256mem, - v2i64, v4i64>, VEX_W; + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, + VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, + VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W; } let ExeDomain = SSEPackedSingle in { - defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32, - mgatherv8i32, VR256, vx128mem, vy256mem, - v4i32, v8i32>; - defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64, - mgatherv4i64, VR128, vx64mem, vy128mem, - v4i32, v4i32>; + defm VGATHERDPS : avx2_gather<0x92, 
"vgatherdps", v4f32, v8f32, + VR256, vx128mem, vy256mem, v4i32, v8i32>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, + VR128, vx64mem, vy128mem, v4i32, v4i32>; } } } @@ -7969,8 +7901,8 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT, X86MemOperand X86MemOp, bit Is2Addr = 0> { let ExeDomain = SSEPackedInt, AsmString = !if(Is2Addr, - OpcodeStr##"\t{$src2, $dst|$dst, $src2}", - OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { + OpcodeStr#"\t{$src2, $dst|$dst, $src2}", + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { let isCommutable = 1 in def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "", [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>, @@ -7987,8 +7919,8 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT, SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag, X86MemOperand X86MemOp, bit Is2Addr = 0> { let AsmString = !if(Is2Addr, - OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}", - OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { + OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}", + OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), "", [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))], @@ -8008,9 +7940,9 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> { defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode, VR128, load, i128mem, 1>; let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { - defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128, + defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128, load, i128mem>, VEX_4V, VEX_W; - defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256, + defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256, load, i256mem>, VEX_4V, VEX_L, VEX_W; } } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td index 9d974b716dda..823ff78b9903 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -472,19 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>; + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>; def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>, + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16; def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>, + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>; + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>; // Rotate by 1 def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), @@ -570,19 +570,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, 
u8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", - [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))]>; + [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>; def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", - [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))]>, + [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize16; def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", - [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))]>, + [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>, OpSize32; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", - [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))]>; + [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>; // Rotate by 1 def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), @@ -661,32 +661,32 @@ let Uses = [CL], SchedRW = [WriteSHDrrcl] in { def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>, + [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, CL))]>, TB, OpSize16; def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>, + [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, CL))]>, TB, OpSize16; def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, + [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, CL))]>, TB, OpSize32; def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, + [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, CL))]>, TB, OpSize32; def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, + [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, CL))]>, TB; def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, + [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, CL))]>, TB; } // SchedRW @@ -695,42 +695,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, (i8 imm:$src3)))]>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, (i8 imm:$src3)))]>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, (i8 imm:$src3)))]>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, 
(outs GR32:$dst), (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, (i8 imm:$src3)))]>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, + [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, (i8 imm:$src3)))]>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, + [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, (i8 imm:$src3)))]>, TB; } // SchedRW @@ -739,70 +739,70 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg, let Uses = [CL], SchedRW = [WriteSHDmrcl] in { def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), - addr:$dst)]>, TB, OpSize16; + [(store (X86fshl (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)]>, TB, OpSize16; def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), - addr:$dst)]>, TB, OpSize16; + [(store (X86fshr GR16:$src2, (loadi16 addr:$dst), CL), + addr:$dst)]>, TB, OpSize16; def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + [(store (fshl (loadi32 addr:$dst), GR32:$src2, CL), addr:$dst)]>, TB, OpSize32; def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), - addr:$dst)]>, TB, OpSize32; + [(store (fshr GR32:$src2, (loadi32 addr:$dst), CL), + addr:$dst)]>, TB, OpSize32; def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), - addr:$dst)]>, TB; + [(store (fshl (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)]>, TB; def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", - [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL), - addr:$dst)]>, TB; + [(store (fshr GR64:$src2, (loadi64 addr:$dst), CL), + addr:$dst)]>, TB; } // SchedRW let SchedRW = [WriteSHDmri] in { def SHLD16mri8 : Ii8<0xA4, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(store (X86shld (loadi16 addr:$dst), GR16:$src2, - (i8 imm:$src3)), addr:$dst)]>, + [(store (X86fshl (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, TB, OpSize16; def SHRD16mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, - (i8 imm:$src3)), addr:$dst)]>, + [(store (X86fshr GR16:$src2, (loadi16 addr:$dst), + (i8 imm:$src3)), addr:$dst)]>, TB, OpSize16; def SHLD32mri8 : Ii8<0xA4, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(store (X86shld (loadi32 addr:$dst), GR32:$src2, - (i8 imm:$src3)), addr:$dst)]>, 
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, TB, OpSize32; def SHRD32mri8 : Ii8<0xAC, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, - (i8 imm:$src3)), addr:$dst)]>, + [(store (fshr GR32:$src2, (loadi32 addr:$dst), + (i8 imm:$src3)), addr:$dst)]>, TB, OpSize32; def SHLD64mri8 : RIi8<0xA4, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(store (X86shld (loadi64 addr:$dst), GR64:$src2, - (i8 imm:$src3)), addr:$dst)]>, + [(store (fshl (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)]>, TB; def SHRD64mri8 : RIi8<0xAC, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, - (i8 imm:$src3)), addr:$dst)]>, + [(store (fshr GR64:$src2, (loadi64 addr:$dst), + (i8 imm:$src3)), addr:$dst)]>, TB; } // SchedRW @@ -1013,3 +1013,21 @@ let Predicates = [HasBMI2] in { (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } + +def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)), + (ROL8ri GR8:$src1, relocImm:$src2)>; +def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)), + (ROL16ri GR16:$src1, relocImm:$src2)>; +def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)), + (ROL32ri GR32:$src1, relocImm:$src2)>; +def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)), + (ROL64ri GR64:$src1, relocImm:$src2)>; + +def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)), + (ROR8ri GR8:$src1, relocImm:$src2)>; +def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)), + (ROR16ri GR16:$src1, relocImm:$src2)>; +def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)), + (ROR32ri GR32:$src1, relocImm:$src2)>; +def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)), + (ROR64ri GR64:$src1, relocImm:$src2)>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td index 7f41feb6c0d9..c23bc7ebbf70 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td @@ -23,7 +23,20 @@ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB; let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in { def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; - def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB; + + def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2), + "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16; + def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2), + "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32; + def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2), + "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB; + + def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2), + "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16; + def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2), + "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32; + def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2), + "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB; } def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>; @@ -149,12 +162,12 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), // Segment override instruction prefixes let SchedRW = [WriteNop] in { -def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; -def 
SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; -def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; -def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; -def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; -def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; +def CS_PREFIX : I<0x2E, PrefixByte, (outs), (ins), "cs", []>; +def SS_PREFIX : I<0x36, PrefixByte, (outs), (ins), "ss", []>; +def DS_PREFIX : I<0x3E, PrefixByte, (outs), (ins), "ds", []>; +def ES_PREFIX : I<0x26, PrefixByte, (outs), (ins), "es", []>; +def FS_PREFIX : I<0x64, PrefixByte, (outs), (ins), "fs", []>; +def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>; } // SchedRW //===----------------------------------------------------------------------===// @@ -512,12 +525,12 @@ let SchedRW = [WriteSystem] in { let SchedRW = [WriteSystem] in { let Predicates = [HasXSAVE] in { let Defs = [EDX, EAX], Uses = [ECX] in - def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", - [(int_x86_xsetbv ECX, EDX, EAX)]>, TB; + [(int_x86_xsetbv ECX, EDX, EAX)]>, PS; } // HasXSAVE @@ -542,47 +555,47 @@ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst), [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>; def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst), "xsavec\t$dst", - [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>; + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC]>; def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst), "xsavec64\t$dst", - [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>; + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC, In64BitMode]>; def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst), "xsaves\t$dst", - [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>; def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst), "xsaves64\t$dst", - [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst), "xrstors\t$dst", - [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst), "xrstors64\t$dst", - [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>; + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>; } // Uses } // SchedRW //===----------------------------------------------------------------------===// // VIA PadLock crypto instructions let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in - def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB; + def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB, REP; def : InstAlias<"xstorerng", (XSTORE)>; let SchedRW = [WriteSystem] in { let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { - def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB; - def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB; - def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB; - def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB; 
- def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB; + def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB, REP; + def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB, REP; + def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB, REP; + def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB, REP; + def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB, REP; } let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { - def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB; - def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB; + def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB, REP; + def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB, REP; } let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in - def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; + def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB, REP; } // SchedRW //==-----------------------------------------------------------------------===// @@ -590,10 +603,10 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", - [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB; + [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, PS; let Uses = [EAX, ECX, EDX] in def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", - [(X86wrpkru EAX, EDX, ECX)]>, TB; + [(X86wrpkru EAX, EDX, ECX)]>, PS; } // SchedRW //===----------------------------------------------------------------------===// @@ -653,15 +666,15 @@ let Predicates = [In64BitMode, HasINVPCID] in { //===----------------------------------------------------------------------===// // SMAP Instruction let Defs = [EFLAGS], SchedRW = [WriteSystem] in { - def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; - def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, PS; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, PS; } //===----------------------------------------------------------------------===// // SMX Instruction let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { - def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB; + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, PS; } // Uses, Defs } // SchedRW @@ -729,6 +742,6 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in - def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB, + def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, PS, Requires<[HasPCONFIG]>; } // SchedRW diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td index 41b839425ccd..28563eeb4484 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td @@ -37,11 +37,11 @@ def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>; } def XEND : I<0x01, MRM_D5, (outs), (ins), - "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; + "xend", [(int_x86_xend)]>, PS, Requires<[HasRTM]>; let Defs = [EFLAGS] in def XTEST : I<0x01, MRM_D6, (outs), (ins), - "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>; + "xtest", [(set EFLAGS, (X86xtest))]>, PS, Requires<[HasRTM]>; def XABORT : Ii8<0xc6, MRM_F8, (outs), 
(ins i8imm:$imm), "xabort\t$imm", @@ -52,8 +52,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), let SchedRW = [WriteSystem] in { let isAsmParserOnly = 1 in { -def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>; -def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>; +def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>; +def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>; } } // SchedRW diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td index 37bc4ce2e053..d204a33358ea 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td @@ -37,7 +37,7 @@ def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), "vmclear\t$vmcs", []>, PD; // OF 01 D4 -def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, PS; // 0F 01 C2 def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td index 229af366d940..a5976b7d2d74 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td @@ -40,14 +40,14 @@ let ExeDomain = SSEPackedInt in { // Scalar load 2 addr operand instructions multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, - Operand memop, ComplexPattern mem_cpat, + Operand memop, PatFrags mem_frags, X86FoldableSchedWrite sched> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP, + [(set VR128:$dst, (Int (mem_frags addr:$src)))]>, XOP, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -335,13 +335,13 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V, Sched<[sched]>; - // FIXME: This pattern can't match. + // FIXME: We can't write a pattern for this in tablegen. 
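The VPCMOV patterns above (and the load forms that follow) match a bitwise select: each result bit comes from $src1 where the corresponding bit of $src3 is set, and from $src2 where it is clear. A scalar sketch of the same identity (illustrative only, not from this patch):

#include <cstdint>

// (src1 & mask) | (src2 & ~mask) -- the expanded form matched for VPCMOV,
// i.e. a per-bit "mask ? src1 : src2".
uint64_t bit_select(uint64_t src1, uint64_t src2, uint64_t mask) {
  return (src1 & mask) | (src2 & ~mask);
}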
+ let hasSideEffects = 0, mayLoad = 1 in def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1), - (X86andnp (load addr:$src3), RC:$src2))))]>, + []>, XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), @@ -383,13 +383,13 @@ let Predicates = [HasXOP] in { (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>; def : Pat<(or (and VR128:$src3, VR128:$src1), - (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))), + (X86andnp VR128:$src3, (loadv16i8 addr:$src2))), (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; def : Pat<(or (and VR128:$src3, VR128:$src1), - (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))), + (X86andnp VR128:$src3, (loadv8i16 addr:$src2))), (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; def : Pat<(or (and VR128:$src3, VR128:$src1), - (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))), + (X86andnp VR128:$src3, (loadv4i32 addr:$src2))), (VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>; def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1), @@ -403,13 +403,13 @@ let Predicates = [HasXOP] in { (VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>; def : Pat<(or (and VR256:$src3, VR256:$src1), - (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))), + (X86andnp VR256:$src3, (loadv32i8 addr:$src2))), (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; def : Pat<(or (and VR256:$src3, VR256:$src1), - (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))), + (X86andnp VR256:$src3, (loadv16i16 addr:$src2))), (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; def : Pat<(or (and VR256:$src3, VR256:$src1), - (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))), + (X86andnp VR256:$src3, (loadv8i32 addr:$src2))), (VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp index 3f9d626ff912..60fb4d2ef4bf 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/X86BaseInfo.h" +#include "X86.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86RegisterBankInfo.h" @@ -71,7 +72,7 @@ private: // TODO: remove after supported by Tablegen-erated instruction selection. unsigned getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc, - uint64_t Alignment) const; + Align Alignment) const; bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; @@ -394,7 +395,7 @@ bool X86InstructionSelector::select(MachineInstr &I) { unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc, - uint64_t Alignment) const { + Align Alignment) const { bool Isload = (Opc == TargetOpcode::G_LOAD); bool HasAVX = STI.hasAVX(); bool HasAVX512 = STI.hasAVX512(); @@ -427,7 +428,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); } else if (Ty.isVector() && Ty.getSizeInBits() == 128) { - if (Alignment >= 16) + if (Alignment >= Align(16)) return Isload ? (HasVLX ? 
X86::VMOVAPSZ128rm : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX @@ -446,7 +447,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, ? X86::VMOVUPSZ128mr_NOVLX : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); } else if (Ty.isVector() && Ty.getSizeInBits() == 256) { - if (Alignment >= 32) + if (Alignment >= Align(32)) return Isload ? (HasVLX ? X86::VMOVAPSZ256rm : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX : X86::VMOVAPSYrm) @@ -461,7 +462,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX : X86::VMOVUPSYmr); } else if (Ty.isVector() && Ty.getSizeInBits() == 512) { - if (Alignment >= 64) + if (Alignment >= Align(64)) return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; @@ -520,13 +521,13 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n"); return false; } - if (MemOp.getAlignment() < Ty.getSizeInBits()/8) { + if (MemOp.getAlign() < Ty.getSizeInBits() / 8) { LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n"); return false; } } - unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment()); + unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlign()); if (NewOpc == Opc) return false; @@ -1435,14 +1436,15 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, const Register DstReg = I.getOperand(0).getReg(); const LLT DstTy = MRI.getType(DstReg); const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); - unsigned Align = DstTy.getSizeInBits(); + Align Alignment = Align(DstTy.getSizeInBytes()); const DebugLoc &DbgLoc = I.getDebugLoc(); - unsigned Opc = getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Align); + unsigned Opc = + getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Alignment); // Create the load from the constant pool. const ConstantFP *CFP = I.getOperand(1).getFPImm(); - unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Align); + unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Alignment); MachineInstr *LoadInst = nullptr; unsigned char OpFlag = STI.classifyLocalReference(nullptr); @@ -1456,7 +1458,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I, MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, - MF.getDataLayout().getPointerSize(), Align); + MF.getDataLayout().getPointerSize(), Alignment); LoadInst = addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 8f74a8fe041d..a19e12766e10 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -69,7 +69,7 @@ class X86InterleavedAccessGroup { /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors /// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors. 
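
The getLoadStoreOp() hunks above switch the alignment parameter to llvm::Align and compare it against the vector width in bytes to choose between the aligned (VMOVAPS-family) and unaligned (VMOVUPS-family) opcodes. A simplified sketch of that decision, with hypothetical enum names standing in for the real opcode tables:

#include <cassert>
#include <cstdint>

// Hypothetical stand-ins for the real X86 opcode choices.
enum class VecMovOp { MOVAPS128, MOVUPS128, MOVAPS256, MOVUPS256, MOVAPS512, MOVUPS512 };

// Mirrors the shape of getLoadStoreOp(): the aligned form is legal only when
// the pointer alignment covers the whole vector in bytes.
static VecMovOp pickVectorMov(unsigned SizeInBits, uint64_t AlignInBytes) {
  switch (SizeInBits) {
  case 128: return AlignInBytes >= 16 ? VecMovOp::MOVAPS128 : VecMovOp::MOVUPS128;
  case 256: return AlignInBytes >= 32 ? VecMovOp::MOVAPS256 : VecMovOp::MOVUPS256;
  case 512: return AlignInBytes >= 64 ? VecMovOp::MOVAPS512 : VecMovOp::MOVUPS512;
  }
  assert(false && "unsupported vector width");
  return VecMovOp::MOVUPS128;
}

int main() {
  assert(pickVectorMov(128, 16) == VecMovOp::MOVAPS128);
  assert(pickVectorMov(256, 16) == VecMovOp::MOVUPS256); // under-aligned -> unaligned form
  assert(pickVectorMov(512, 64) == VecMovOp::MOVAPS512);
  return 0;
}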
- void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T, + void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T, SmallVectorImpl<Instruction *> &DecomposedVectors); /// Performs matrix transposition on a 4x4 matrix \p InputVectors and @@ -127,7 +127,7 @@ public: bool X86InterleavedAccessGroup::isSupported() const { VectorType *ShuffleVecTy = Shuffles[0]->getType(); - Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + Type *ShuffleEltTy = ShuffleVecTy->getElementType(); unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy); unsigned WideInstSize; @@ -150,7 +150,7 @@ bool X86InterleavedAccessGroup::isSupported() const { // We support shuffle represents stride 4 for byte type with size of // WideInstSize. if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4) - return true; + return true; if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 && (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 || @@ -165,7 +165,7 @@ bool X86InterleavedAccessGroup::isSupported() const { } void X86InterleavedAccessGroup::decompose( - Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, + Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy, SmallVectorImpl<Instruction *> &DecomposedVectors) { assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) && "Expected Load or Shuffle"); @@ -186,8 +186,8 @@ void X86InterleavedAccessGroup::decompose( DecomposedVectors.push_back( cast<ShuffleVectorInst>(Builder.CreateShuffleVector( Op0, Op1, - createSequentialMask(Builder, Indices[i], - SubVecTy->getVectorNumElements(), 0)))); + createSequentialMask(Indices[i], SubVecTy->getNumElements(), + 0)))); return; } @@ -201,7 +201,7 @@ void X86InterleavedAccessGroup::decompose( // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1] unsigned VecLength = DL.getTypeSizeInBits(VecWidth); if (VecLength == 768 || VecLength == 1536) { - VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16); + VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16); VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace()); VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); NumLoads = NumSubVectors * (VecLength / 384); @@ -211,13 +211,20 @@ void X86InterleavedAccessGroup::decompose( VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); } // Generate N loads of T type. + assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() && + "VecBaseTy's size must be a multiple of 8"); + const Align FirstAlignment = LI->getAlign(); + const Align SubsequentAlignment = commonAlignment( + FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8); + Align Alignment = FirstAlignment; for (unsigned i = 0; i < NumLoads; i++) { // TODO: Support inbounds GEP. 
Value *NewBasePtr = Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = - Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment()); + Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment); DecomposedVectors.push_back(NewLoad); + Alignment = SubsequentAlignment; } } @@ -229,11 +236,11 @@ static MVT scaleVectorType(MVT VT) { VT.getVectorNumElements() / 2); } -static uint32_t Concat[] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 }; +static constexpr int Concat[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}; // genShuffleBland - Creates shuffle according to two vectors.This function is // only works on instructions with lane inside 256 registers. According to @@ -251,11 +258,11 @@ static uint32_t Concat[] = { // By computing the shuffle on a sequence of 16 elements(one lane) and add the // correct offset. We are creating a vpsuffed + blend sequence between two // shuffles. -static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask, - SmallVectorImpl<uint32_t> &Out, int LowOffset, - int HighOffset) { +static void genShuffleBland(MVT VT, ArrayRef<int> Mask, + SmallVectorImpl<int> &Out, int LowOffset, + int HighOffset) { assert(VT.getSizeInBits() >= 256 && - "This function doesn't accept width smaller then 256"); + "This function doesn't accept width smaller then 256"); unsigned NumOfElm = VT.getVectorNumElements(); for (unsigned i = 0; i < Mask.size(); i++) Out.push_back(Mask[i] + LowOffset); @@ -282,36 +289,35 @@ static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask, // Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11| static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix, - ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf, - unsigned VecElems, unsigned Stride, - IRBuilder<> Builder) { + ArrayRef<Value *> Vec, ArrayRef<int> VPShuf, + unsigned VecElems, unsigned Stride, + IRBuilder<> &Builder) { if (VecElems == 16) { for (unsigned i = 0; i < Stride; i++) TransposedMatrix[i] = Builder.CreateShuffleVector( - Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf); + Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf); return; } - SmallVector<uint32_t, 32> OptimizeShuf; + SmallVector<int, 32> OptimizeShuf; Value *Temp[8]; for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) { genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16, - (i + 1) / Stride * 16); + (i + 1) / Stride * 16); Temp[i / 2] = Builder.CreateShuffleVector( - Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf); + Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf); OptimizeShuf.clear(); } if (VecElems == 32) { std::copy(Temp, Temp + Stride, TransposedMatrix.begin()); return; - } - else + } else for (unsigned i = 0; i < Stride; i++) TransposedMatrix[i] = - Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat); + Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat); } void X86InterleavedAccessGroup::interleave8bitStride4VF8( @@ -325,19 +331,19 @@ void X86InterleavedAccessGroup::interleave8bitStride4VF8( MVT VT = MVT::v8i16; TransposedMatrix.resize(2); - SmallVector<uint32_t, 16> MaskLow; - 
SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord; - SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord; + SmallVector<int, 16> MaskLow; + SmallVector<int, 32> MaskLowTemp1, MaskLowWord; + SmallVector<int, 32> MaskHighTemp1, MaskHighWord; for (unsigned i = 0; i < 8; ++i) { MaskLow.push_back(i); MaskLow.push_back(i + 8); } - createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false); - createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false); - scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord); - scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord); + createUnpackShuffleMask(VT, MaskLowTemp1, true, false); + createUnpackShuffleMask(VT, MaskHighTemp1, false, false); + narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord); + narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord); // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7 // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7 Value *IntrVec1Low = @@ -367,25 +373,25 @@ void X86InterleavedAccessGroup::interleave8bitStride4( MVT HalfVT = scaleVectorType(VT); TransposedMatrix.resize(4); - SmallVector<uint32_t, 32> MaskHigh; - SmallVector<uint32_t, 32> MaskLow; - SmallVector<uint32_t, 32> LowHighMask[2]; - SmallVector<uint32_t, 32> MaskHighTemp; - SmallVector<uint32_t, 32> MaskLowTemp; + SmallVector<int, 32> MaskHigh; + SmallVector<int, 32> MaskLow; + SmallVector<int, 32> LowHighMask[2]; + SmallVector<int, 32> MaskHighTemp; + SmallVector<int, 32> MaskLowTemp; // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86 // shuffle pattern. - createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false); - createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false); + createUnpackShuffleMask(VT, MaskLow, true, false); + createUnpackShuffleMask(VT, MaskHigh, false, false); // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86 // shuffle pattern. - createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false); - createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false); - scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]); - scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]); + createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false); + createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false); + narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]); + narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]); // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23 // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31 @@ -433,7 +439,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4( // For example shuffle pattern for VF 16 register size 256 -> lanes = 2 // {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>} static void createShuffleStride(MVT VT, int Stride, - SmallVectorImpl<uint32_t> &Mask) { + SmallVectorImpl<int> &Mask) { int VectorSize = VT.getSizeInBits(); int VF = VT.getVectorNumElements(); int LaneCount = std::max(VectorSize / 128, 1); @@ -446,7 +452,7 @@ static void createShuffleStride(MVT VT, int Stride, // inside mask a shuffleMask. A mask contains exactly 3 groups, where // each group is a monotonically increasing sequence with stride 3. 
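
createShuffleStride(), just above, builds the per-lane stride mask whose expected shape is spelled out in its comment. A small sketch that reproduces that example, assuming element i of each 128-bit lane comes from position (i * Stride) % LaneElems; the in-tree helper works on MVTs and SmallVectorImpl instead:

#include <cassert>
#include <vector>

// Within each lane of LaneElems elements, element i is taken from
// (i * Stride) % LaneElems, and each lane is offset by LaneElems.
static std::vector<int> strideShuffleMask(int NumLanes, int LaneElems, int Stride) {
  std::vector<int> Mask;
  for (int Lane = 0; Lane < NumLanes; ++Lane)
    for (int i = 0; i < LaneElems; ++i)
      Mask.push_back((i * Stride) % LaneElems + LaneElems * Lane);
  return Mask;
}

int main() {
  // VF 16 in a 256-bit register -> 2 lanes of 8 elements, stride 3, matching
  // the comment's example {0,3,6,1,4,7,2,5 | 8,11,14,9,12,15,10,13}.
  std::vector<int> Expected = {0, 3, 6, 1, 4, 7, 2, 5,
                               8, 11, 14, 9, 12, 15, 10, 13};
  assert(strideShuffleMask(2, 8, 3) == Expected);
  return 0;
}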
// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2} -static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) { +static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) { int VectorSize = VT.getSizeInBits(); int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1); for (int i = 0, FirstGroupElement = 0; i < 3; i++) { @@ -470,7 +476,7 @@ static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) { // direction of the alignment. (false - align to the "right" side while true - // align to the "left" side) static void DecodePALIGNRMask(MVT VT, unsigned Imm, - SmallVectorImpl<uint32_t> &ShuffleMask, + SmallVectorImpl<int> &ShuffleMask, bool AlignDirection = true, bool Unary = false) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1); @@ -519,7 +525,7 @@ static void DecodePALIGNRMask(MVT VT, unsigned Imm, // Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11| static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec, - unsigned VecElems, IRBuilder<> Builder) { + unsigned VecElems, IRBuilder<> &Builder) { if (VecElems == 16) { for (int i = 0; i < 3; i++) Vec[i] = InVec[i]; @@ -547,11 +553,11 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3( // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7 TransposedMatrix.resize(3); - SmallVector<uint32_t, 32> VPShuf; - SmallVector<uint32_t, 32> VPAlign[2]; - SmallVector<uint32_t, 32> VPAlign2; - SmallVector<uint32_t, 32> VPAlign3; - SmallVector<uint32_t, 3> GroupSize; + SmallVector<int, 32> VPShuf; + SmallVector<int, 32> VPAlign[2]; + SmallVector<int, 32> VPAlign2; + SmallVector<int, 32> VPAlign3; + SmallVector<int, 3> GroupSize; Value *Vec[6], *TempVector[3]; MVT VT = MVT::getVT(Shuffles[0]->getType()); @@ -605,8 +611,8 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3( // group2Shuffle reorder the shuffle stride back into continuous order. // For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} => // MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}. 
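
setGroupSize(), earlier in this hunk, measures how many elements fall into each of the three stride groups that group2Shuffle() later consumes; for the mask {0,3,6,1,4,7,2,5} the comment gives {3,3,2}. A standalone sketch of that count, with the group sizes only and no MVT plumbing:

#include <cassert>
#include <vector>

// Group g of a stride-3 de-interleave mask holds g, g+3, g+6, ... below VF,
// so its size is ceil((VF - g) / 3).
static std::vector<int> groupSizes(int VF) {
  std::vector<int> Sizes;
  for (int g = 0; g < 3; ++g)
    Sizes.push_back((VF - g + 2) / 3);
  return Sizes;
}

int main() {
  // Matches the example in the comment: {0,3,6,1,4,7,2,5} => {3,3,2}.
  assert((groupSizes(8) == std::vector<int>{3, 3, 2}));
  return 0;
}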
-static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask, - SmallVectorImpl<uint32_t> &Output) { +static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask, + SmallVectorImpl<int> &Output) { int IndexGroup[3] = {0, 0, 0}; int Index = 0; int VectorWidth = VT.getSizeInBits(); @@ -633,11 +639,11 @@ void X86InterleavedAccessGroup::interleave8bitStride3( // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7 TransposedMatrix.resize(3); - SmallVector<uint32_t, 3> GroupSize; - SmallVector<uint32_t, 32> VPShuf; - SmallVector<uint32_t, 32> VPAlign[3]; - SmallVector<uint32_t, 32> VPAlign2; - SmallVector<uint32_t, 32> VPAlign3; + SmallVector<int, 3> GroupSize; + SmallVector<int, 32> VPShuf; + SmallVector<int, 32> VPAlign[3]; + SmallVector<int, 32> VPAlign2; + SmallVector<int, 32> VPAlign3; Value *Vec[3], *TempVector[3]; MVT VT = MVT::getVectorVT(MVT::i8, VecElems); @@ -682,7 +688,7 @@ void X86InterleavedAccessGroup::interleave8bitStride3( unsigned NumOfElm = VT.getVectorNumElements(); group2Shuffle(VT, GroupSize, VPShuf); - reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder); + reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder); } void X86InterleavedAccessGroup::transpose_4x4( @@ -692,25 +698,25 @@ void X86InterleavedAccessGroup::transpose_4x4( TransposedMatrix.resize(4); // dst = src1[0,1],src2[0,1] - uint32_t IntMask1[] = {0, 1, 4, 5}; - ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4); + static constexpr int IntMask1[] = {0, 1, 4, 5}; + ArrayRef<int> Mask = makeArrayRef(IntMask1, 4); Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); // dst = src1[2,3],src2[2,3] - uint32_t IntMask2[] = {2, 3, 6, 7}; + static constexpr int IntMask2[] = {2, 3, 6, 7}; Mask = makeArrayRef(IntMask2, 4); Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask); Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask); // dst = src1[0],src2[0],src1[2],src2[2] - uint32_t IntMask3[] = {0, 4, 2, 6}; + static constexpr int IntMask3[] = {0, 4, 2, 6}; Mask = makeArrayRef(IntMask3, 4); TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); // dst = src1[1],src2[1],src1[3],src2[3] - uint32_t IntMask4[] = {1, 5, 3, 7}; + static constexpr int IntMask4[] = {1, 5, 3, 7}; Mask = makeArrayRef(IntMask4, 4); TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask); TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask); @@ -721,14 +727,14 @@ void X86InterleavedAccessGroup::transpose_4x4( bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { SmallVector<Instruction *, 4> DecomposedVectors; SmallVector<Value *, 4> TransposedVectors; - VectorType *ShuffleTy = Shuffles[0]->getType(); + auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType()); if (isa<LoadInst>(Inst)) { // Try to generate target-sized register(/instruction). decompose(Inst, Factor, ShuffleTy, DecomposedVectors); - Type *ShuffleEltTy = Inst->getType(); - unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor; + auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType()); + unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor; // Perform matrix-transposition in order to compute interleaved // results by generating some sort of (optimized) target-specific // instructions. 
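
transpose_4x4() above performs the 4x4 transpose with two rounds of shuffles using exactly the masks {0,1,4,5}, {2,3,6,7}, {0,4,2,6} and {1,5,3,7}. A self-contained sketch checking that this sequence really produces the matrix columns:

#include <array>
#include <cassert>

using Vec4 = std::array<int, 4>;

// Two-operand shuffle with the usual LLVM semantics: index i < 4 selects
// A[i], index i >= 4 selects B[i - 4].
static Vec4 shuffle(const Vec4 &A, const Vec4 &B, const std::array<int, 4> &M) {
  Vec4 R;
  for (int i = 0; i < 4; ++i)
    R[i] = M[i] < 4 ? A[M[i]] : B[M[i] - 4];
  return R;
}

int main() {
  Vec4 Row0 = {0, 1, 2, 3}, Row1 = {4, 5, 6, 7},
       Row2 = {8, 9, 10, 11}, Row3 = {12, 13, 14, 15};

  // Stage 1: pair up the low and high halves of rows 0/2 and 1/3.
  Vec4 IntrVec1 = shuffle(Row0, Row2, {0, 1, 4, 5});
  Vec4 IntrVec2 = shuffle(Row1, Row3, {0, 1, 4, 5});
  Vec4 IntrVec3 = shuffle(Row0, Row2, {2, 3, 6, 7});
  Vec4 IntrVec4 = shuffle(Row1, Row3, {2, 3, 6, 7});

  // Stage 2: interleave to produce the matrix columns.
  Vec4 T0 = shuffle(IntrVec1, IntrVec2, {0, 4, 2, 6});
  Vec4 T1 = shuffle(IntrVec1, IntrVec2, {1, 5, 3, 7});
  Vec4 T2 = shuffle(IntrVec3, IntrVec4, {0, 4, 2, 6});
  Vec4 T3 = shuffle(IntrVec3, IntrVec4, {1, 5, 3, 7});

  assert((T0 == Vec4{0, 4, 8, 12}) && (T1 == Vec4{1, 5, 9, 13}));
  assert((T2 == Vec4{2, 6, 10, 14}) && (T3 == Vec4{3, 7, 11, 15}));
  return 0;
}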
@@ -756,13 +762,14 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { return true; } - Type *ShuffleEltTy = ShuffleTy->getVectorElementType(); - unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor; + Type *ShuffleEltTy = ShuffleTy->getElementType(); + unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor; // Lower the interleaved stores: // 1. Decompose the interleaved wide shuffle into individual shuffle // vectors. - decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems), + decompose(Shuffles[0], Factor, + FixedVectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors); // 2. Transpose the interleaved-vectors into vectors of contiguous @@ -793,8 +800,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // 4. Generate a store instruction for wide-vec. StoreInst *SI = cast<StoreInst>(Inst); - Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), - SI->getAlignment()); + Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign()); return true; } @@ -826,7 +832,8 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - assert(SVI->getType()->getVectorNumElements() % Factor == 0 && + assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor == + 0 && "Invalid interleaved store"); // Holds the indices of SVI that correspond to the starting index of each diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 40bf28df3b90..1c10c07abeee 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -679,8 +679,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VTRUNCS, X86ISD::VMTRUNCS), X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, TRUNCATE_TO_REG, X86ISD::VTRUNCS, X86ISD::VMTRUNCS), - X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, - X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, TRUNCATE_TO_REG, + X86ISD::VTRUNCS, X86ISD::VMTRUNCS), X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, X86ISD::VTRUNCS, 0), X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, @@ -783,10 +783,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS, X86ISD::FSUBS_RND), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK, X86ISD::FSUBS, X86ISD::FSUBS_RND), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, - X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE, X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK, @@ -997,7 +993,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0), + X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0), + X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0), + X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0), + X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), 
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), + X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), + X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT), @@ -1022,6 +1027,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT), X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE), X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0), + X86_INTRINSIC_DATA(sse2_cmp_sd, INTR_TYPE_3OP, X86ISD::FSETCC, 0), X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE), X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT), @@ -1104,8 +1110,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB), X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0), - X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), @@ -1157,10 +1161,8 @@ static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) { } static void verifyIntrinsicTables() { - assert(std::is_sorted(std::begin(IntrinsicsWithoutChain), - std::end(IntrinsicsWithoutChain)) && - std::is_sorted(std::begin(IntrinsicsWithChain), - std::end(IntrinsicsWithChain)) && + assert(llvm::is_sorted(IntrinsicsWithoutChain) && + llvm::is_sorted(IntrinsicsWithChain) && "Intrinsic data tables should be sorted by Intrinsic ID"); assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), std::end(IntrinsicsWithoutChain)) == diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp index da53d6420021..84f560f2f9ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp @@ -85,14 +85,14 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, verify(*STI.getInstrInfo()); } -bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const { +bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; switch (MI.getIntrinsicID()) { case Intrinsic::memcpy: case Intrinsic::memset: case Intrinsic::memmove: - if (createMemLibcall(MIRBuilder, MRI, MI) == + if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) == LegalizerHelper::UnableToLegalize) return false; MI.eraseFromParent(); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h index 7a0f13fb5ae6..72d25096d72b 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h @@ -32,8 
+32,8 @@ private: public: X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM); - bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &MIRBuilder) const override; + bool legalizeIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI) const override; private: void setLegalizerInfo32bit(); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 35fc439998f9..50f8b3477acc 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -822,79 +822,3 @@ INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY, FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() { return new X86LoadValueInjectionLoadHardeningPass(); } - -namespace { - -/// The `X86LoadValueInjectionLoadHardeningPass` above depends on expensive -/// analysis passes that add complexity to the pipeline. This complexity -/// can cause noticable overhead when no optimizations are enabled, i.e., -O0. -/// The purpose of `X86LoadValueInjectionLoadHardeningUnoptimizedPass` is to -/// provide the same security as the optimized pass, but without adding -/// unnecessary complexity to the LLVM pipeline. -/// -/// The behavior of this pass is simply to insert an LFENCE after every load -/// instruction. -class X86LoadValueInjectionLoadHardeningUnoptimizedPass - : public MachineFunctionPass { -public: - X86LoadValueInjectionLoadHardeningUnoptimizedPass() - : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { - return "X86 Load Value Injection (LVI) Load Hardening (Unoptimized)"; - } - bool runOnMachineFunction(MachineFunction &MF) override; - static char ID; -}; - -} // end anonymous namespace - -char X86LoadValueInjectionLoadHardeningUnoptimizedPass::ID = 0; - -bool X86LoadValueInjectionLoadHardeningUnoptimizedPass::runOnMachineFunction( - MachineFunction &MF) { - LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName() - << " *****\n"); - const X86Subtarget *STI = &MF.getSubtarget<X86Subtarget>(); - if (!STI->useLVILoadHardening()) - return false; - - // FIXME: support 32-bit - if (!STI->is64Bit()) - report_fatal_error("LVI load hardening is only supported on 64-bit", false); - - // Don't skip functions with the "optnone" attr but participate in opt-bisect. - const Function &F = MF.getFunction(); - if (!F.hasOptNone() && skipFunction(F)) - return false; - - bool Modified = false; - ++NumFunctionsConsidered; - - const TargetInstrInfo *TII = STI->getInstrInfo(); - for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (!MI.mayLoad() || MI.getOpcode() == X86::LFENCE || - MI.getOpcode() == X86::MFENCE) - continue; - - MachineBasicBlock::iterator InsertionPt = - MI.getNextNode() ? 
MI.getNextNode() : MBB.end(); - BuildMI(MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE)); - ++NumFences; - Modified = true; - } - } - - if (Modified) - ++NumFunctionsMitigated; - - return Modified; -} - -INITIALIZE_PASS(X86LoadValueInjectionLoadHardeningUnoptimizedPass, PASS_KEY, - "X86 LVI load hardening", false, false) - -FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningUnoptimizedPass() { - return new X86LoadValueInjectionLoadHardeningUnoptimizedPass(); -} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp index f5caaaae4d84..9ce2a4637e2e 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -14,11 +14,12 @@ #include "MCTargetDesc/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86InstComments.h" +#include "MCTargetDesc/X86ShuffleDecode.h" #include "MCTargetDesc/X86TargetStreamer.h" -#include "Utils/X86ShuffleDecode.h" #include "X86AsmPrinter.h" #include "X86RegisterInfo.h" #include "X86ShuffleDecodeConstantPool.h" +#include "X86Subtarget.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/iterator_range.h" @@ -43,6 +44,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -72,9 +74,30 @@ private: } // end anonymous namespace +/// A RAII helper which defines a region of instructions which can't have +/// padding added between them for correctness. +struct NoAutoPaddingScope { + MCStreamer &OS; + const bool OldAllowAutoPadding; + NoAutoPaddingScope(MCStreamer &OS) + : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) { + changeAndComment(false); + } + ~NoAutoPaddingScope() { changeAndComment(OldAllowAutoPadding); } + void changeAndComment(bool b) { + if (b == OS.getAllowAutoPadding()) + return; + OS.setAllowAutoPadding(b); + if (b) + OS.emitRawComment("autopadding"); + else + OS.emitRawComment("noautopadding"); + } +}; + // Emit a minimal sequence of nops spanning NumBytes bytes. -static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, - const MCSubtargetInfo &STI); +static void emitX86Nops(MCStreamer &OS, unsigned NumBytes, + const X86Subtarget *Subtarget); void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst, const MCSubtargetInfo &STI, @@ -94,13 +117,13 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( MCStreamer &OutStreamer, const MCSubtargetInfo &STI) { if (InShadow && CurrentShadowSize < RequiredShadowSize) { InShadow = false; - EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize, - MF->getSubtarget<X86Subtarget>().is64Bit(), STI); + emitX86Nops(OutStreamer, RequiredShadowSize - CurrentShadowSize, + &MF->getSubtarget<X86Subtarget>()); } } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo()); + OutStreamer->emitInstruction(Inst, getSubtargetInfo()); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -116,6 +139,10 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol /// operand to an MCSymbol. 
MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const { + const Triple &TT = TM.getTargetTriple(); + if (MO.isGlobal() && TT.isOSBinFormatELF()) + return AsmPrinter.getSymbolPreferLocal(*MO.getGlobal()); + const DataLayout &DL = MF.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); @@ -272,7 +299,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO, // local labels. This is only safe when the symbols are in the same // section so we are restricting it to jumptable references. MCSymbol *Label = Ctx.createTempSymbol(); - AsmPrinter.OutStreamer->EmitAssignment(Label, Expr); + AsmPrinter.OutStreamer->emitAssignment(Label, Expr); Expr = MCSymbolRefExpr::create(Label, Ctx); } break; @@ -482,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { "LEA has segment specified!"); break; + case X86::MULX32Hrr: + case X86::MULX32Hrm: + case X86::MULX64Hrr: + case X86::MULX64Hrm: { + // Turn into regular MULX by duplicating the destination. + unsigned NewOpc; + switch (OutMI.getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break; + case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break; + case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break; + case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break; + } + OutMI.setOpcode(NewOpc); + // Duplicate the destination. + unsigned DestReg = OutMI.getOperand(0).getReg(); + OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg)); + break; + } + // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B // if one of the registers is extended, but other isn't. case X86::VMOVZPQILo2PQIrr: @@ -929,6 +976,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI) { + NoAutoPaddingScope NoPadScope(*OutStreamer); bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 || MI.getOpcode() == X86::TLS_base_addr64; MCContext &Ctx = OutStreamer->getContext(); @@ -1034,29 +1082,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering, /// Return the longest nop which can be efficiently decoded for the given /// target cpu. 15-bytes is the longest single NOP instruction, but some /// platforms can't decode the longest forms efficiently. -static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) { - uint64_t MaxNopLength = 10; - if (STI.getFeatureBits()[X86::ProcIntelSLM]) - MaxNopLength = 7; - else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP]) - MaxNopLength = 15; - else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP]) - MaxNopLength = 11; - return MaxNopLength; +static unsigned maxLongNopLength(const X86Subtarget *Subtarget) { + if (Subtarget->getFeatureBits()[X86::ProcIntelSLM]) + return 7; + if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP]) + return 15; + if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP]) + return 11; + if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit()) + return 10; + if (Subtarget->is32Bit()) + return 2; + return 1; } /// Emit the largest nop instruction smaller than or equal to \p NumBytes /// bytes. Return the size of nop emitted. -static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, - const MCSubtargetInfo &STI) { - if (!Is64Bit) { - // TODO Do additional checking if the CPU supports multi-byte nops. 
- OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI); - return 1; - } - +static unsigned emitNop(MCStreamer &OS, unsigned NumBytes, + const X86Subtarget *Subtarget) { // Cap a single nop emission at the profitable value for the target - NumBytes = std::min(NumBytes, MaxLongNopLength(STI)); + NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget)); unsigned NopSize; unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; @@ -1125,25 +1170,26 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U); NopSize += NumPrefixes; for (unsigned i = 0; i != NumPrefixes; ++i) - OS.EmitBytes("\x66"); + OS.emitBytes("\x66"); switch (Opc) { default: llvm_unreachable("Unexpected opcode"); case X86::NOOP: - OS.EmitInstruction(MCInstBuilder(Opc), STI); + OS.emitInstruction(MCInstBuilder(Opc), *Subtarget); break; case X86::XCHG16ar: - OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI); + OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), + *Subtarget); break; case X86::NOOPL: case X86::NOOPW: - OS.EmitInstruction(MCInstBuilder(Opc) + OS.emitInstruction(MCInstBuilder(Opc) .addReg(BaseReg) .addImm(ScaleVal) .addReg(IndexReg) .addImm(Displacement) .addReg(SegmentReg), - STI); + *Subtarget); break; } assert(NopSize <= NumBytes && "We overemitted?"); @@ -1151,39 +1197,16 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, } /// Emit the optimal amount of multi-byte nops on X86. -static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit, - const MCSubtargetInfo &STI) { +static void emitX86Nops(MCStreamer &OS, unsigned NumBytes, + const X86Subtarget *Subtarget) { unsigned NopsToEmit = NumBytes; (void)NopsToEmit; while (NumBytes) { - NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI); + NumBytes -= emitNop(OS, NumBytes, Subtarget); assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!"); } } -/// A RAII helper which defines a region of instructions which can't have -/// padding added between them for correctness. 
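
emitNop()/emitX86Nops() above implement a greedy padding loop: each iteration emits the largest NOP that fits both the remaining byte count and the per-NOP cap returned by maxLongNopLength() (7 on Silvermont, 15 or 11 with the fast-NOP features, 10 otherwise on 64-bit). A byte-count-only sketch of that loop, with no real encodings:

#include <algorithm>
#include <cassert>
#include <vector>

// Plan the NOP sizes for NumBytes of padding, given the largest single NOP
// the target decodes efficiently.
static std::vector<unsigned> planNops(unsigned NumBytes, unsigned MaxNopLength) {
  std::vector<unsigned> Sizes;
  while (NumBytes) {
    unsigned This = std::min(NumBytes, MaxNopLength);
    Sizes.push_back(This);
    NumBytes -= This;
  }
  return Sizes;
}

int main() {
  // 24 bytes of padding with a 10-byte cap -> 10 + 10 + 4.
  assert((planNops(24, 10) == std::vector<unsigned>{10, 10, 4}));
  // 6 bytes with a 15-byte cap -> a single 6-byte NOP.
  assert((planNops(6, 15) == std::vector<unsigned>{6}));
  return 0;
}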
-struct NoAutoPaddingScope { - MCStreamer &OS; - const bool OldAllowAutoPadding; - NoAutoPaddingScope(MCStreamer &OS) - : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) { - changeAndComment(false); - } - ~NoAutoPaddingScope() { - changeAndComment(OldAllowAutoPadding); - } - void changeAndComment(bool b) { - if (b == OS.getAllowAutoPadding()) - return; - OS.setAllowAutoPadding(b); - if (b) - OS.emitRawComment("autopadding"); - else - OS.emitRawComment("noautopadding"); - } -}; - void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64"); @@ -1192,8 +1215,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, StatepointOpers SOpers(&MI); if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { - EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(), - getSubtargetInfo()); + emitX86Nops(*OutStreamer, PatchBytes, Subtarget); } else { // Lower call target and choose correct opcode const MachineOperand &CallTarget = SOpers.getCallTarget(); @@ -1235,14 +1257,14 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, MCInst CallInst; CallInst.setOpcode(CallOpcode); CallInst.addOperand(CallTargetMCOp); - OutStreamer->EmitInstruction(CallInst, getSubtargetInfo()); + OutStreamer->emitInstruction(CallInst, getSubtargetInfo()); } // Record our statepoint node in the same section used by STACKMAP // and PATCHPOINT auto &Ctx = OutStreamer->getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer->EmitLabel(MILabel); + OutStreamer->emitLabel(MILabel); SM.recordStatepoint(*MILabel, MI); } @@ -1262,7 +1284,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, auto &Ctx = OutStreamer->getContext(); MCSymbol *FaultingLabel = Ctx.createTempSymbol(); - OutStreamer->EmitLabel(FaultingLabel); + OutStreamer->emitLabel(FaultingLabel); assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!"); FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel); @@ -1280,7 +1302,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI, MI.addOperand(MaybeOperand.getValue()); OutStreamer->AddComment("on-fault: " + HandlerLabel->getName()); - OutStreamer->EmitInstruction(MI, getSubtargetInfo()); + OutStreamer->emitInstruction(MI, getSubtargetInfo()); } void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI, @@ -1317,7 +1339,17 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo()); if (Code.size() < MinSize) { - if (MinSize == 2 && Opcode == X86::PUSH64r) { + if (MinSize == 2 && Subtarget->is32Bit() && + Subtarget->isTargetWindowsMSVC() && + (Subtarget->getCPU().empty() || Subtarget->getCPU() == "pentium3")) { + // For compatibilty reasons, when targetting MSVC, is is important to + // generate a 'legacy' NOP in the form of a 8B FF MOV EDI, EDI. Some tools + // rely specifically on this pattern to be able to patch a function. + // This is only for 32-bit targets, when using /arch:IA32 or /arch:SSE. + OutStreamer->emitInstruction( + MCInstBuilder(X86::MOV32rr_REV).addReg(X86::EDI).addReg(X86::EDI), + *Subtarget); + } else if (MinSize == 2 && Opcode == X86::PUSH64r) { // This is an optimization that lets us get away without emitting a nop in // many cases. // @@ -1325,14 +1357,13 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, // bytes too, so the check on MinSize is important. 
MCI.setOpcode(X86::PUSH64rmr); } else { - unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(), - getSubtargetInfo()); + unsigned NopSize = emitNop(*OutStreamer, MinSize, Subtarget); assert(NopSize == MinSize && "Could not implement MinSize!"); (void)NopSize; } } - OutStreamer->EmitInstruction(MCI, getSubtargetInfo()); + OutStreamer->emitInstruction(MCI, getSubtargetInfo()); } // Lower a stackmap of the form: @@ -1342,7 +1373,7 @@ void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { auto &Ctx = OutStreamer->getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer->EmitLabel(MILabel); + OutStreamer->emitLabel(MILabel); SM.recordStackMap(*MILabel, MI); unsigned NumShadowBytes = MI.getOperand(1).getImm(); @@ -1361,7 +1392,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, auto &Ctx = OutStreamer->getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); - OutStreamer->EmitLabel(MILabel); + OutStreamer->emitLabel(MILabel); SM.recordPatchPoint(*MILabel, MI); PatchPointOpers opers(&MI); @@ -1410,8 +1441,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); - EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(), - getSubtargetInfo()); + emitX86Nops(*OutStreamer, NumBytes - EncodedBytes, Subtarget); } void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, @@ -1442,13 +1472,13 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, // First we emit the label and the jump. auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true); OutStreamer->AddComment("# XRay Custom Event Log"); - OutStreamer->EmitCodeAlignment(2); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitCodeAlignment(2); + OutStreamer->emitLabel(CurSled); // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as // an operand (computed as an offset from the jmp instruction). // FIXME: Find another less hacky way do force the relative jump. - OutStreamer->EmitBinaryData("\xeb\x0f"); + OutStreamer->emitBinaryData("\xeb\x0f"); // The default C calling convention will place two arguments into %rcx and // %rdx -- so we only work with those. @@ -1471,7 +1501,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, EmitAndCountInstruction( MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I])); } else { - EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo()); + emitX86Nops(*OutStreamer, 4, Subtarget); } } @@ -1500,14 +1530,14 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, if (UsedMask[I]) EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I])); else - EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo()); + emitX86Nops(*OutStreamer, 1, Subtarget); OutStreamer->AddComment("xray custom event end."); - // Record the sled version. Older versions of this sled were spelled - // differently, so we let the runtime handle the different offsets we're - // using. - recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1); + // Record the sled version. Version 0 of this sled was spelled differently, so + // we let the runtime handle the different offsets we're using. Version 2 + // changed the absolute address to a PC-relative address. 
+ recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 2); } void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, @@ -1538,13 +1568,13 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, // First we emit the label and the jump. auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true); OutStreamer->AddComment("# XRay Typed Event Log"); - OutStreamer->EmitCodeAlignment(2); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitCodeAlignment(2); + OutStreamer->emitLabel(CurSled); // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as // an operand (computed as an offset from the jmp instruction). // FIXME: Find another less hacky way do force the relative jump. - OutStreamer->EmitBinaryData("\xeb\x14"); + OutStreamer->emitBinaryData("\xeb\x14"); // An x86-64 convention may place three arguments into %rcx, %rdx, and R8, // so we'll work with those. Or we may be called via SystemV, in which case @@ -1569,7 +1599,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, EmitAndCountInstruction( MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I])); } else { - EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo()); + emitX86Nops(*OutStreamer, 4, Subtarget); } } @@ -1603,12 +1633,12 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI, if (UsedMask[I]) EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I])); else - EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo()); + emitX86Nops(*OutStreamer, 1, Subtarget); OutStreamer->AddComment("xray typed event end."); // Record the sled version. - recordSled(CurSled, MI, SledKind::TYPED_EVENT, 0); + recordSled(CurSled, MI, SledKind::TYPED_EVENT, 2); } void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, @@ -1623,7 +1653,7 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, .getValueAsString() .getAsInteger(10, Num)) return; - EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo()); + emitX86Nops(*OutStreamer, Num, Subtarget); return; } // We want to emit the following pattern: @@ -1640,15 +1670,15 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, // call <relative offset, 32-bits> // 5 bytes // auto CurSled = OutContext.createTempSymbol("xray_sled_", true); - OutStreamer->EmitCodeAlignment(2); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitCodeAlignment(2); + OutStreamer->emitLabel(CurSled); // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as // an operand (computed as an offset from the jmp instruction). // FIXME: Find another less hacky way do force the relative jump. - OutStreamer->EmitBytes("\xeb\x09"); - EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo()); - recordSled(CurSled, MI, SledKind::FUNCTION_ENTER); + OutStreamer->emitBytes("\xeb\x09"); + emitX86Nops(*OutStreamer, 9, Subtarget); + recordSled(CurSled, MI, SledKind::FUNCTION_ENTER, 2); } void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, @@ -1670,17 +1700,17 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, // // This just makes sure that the alignment for the next instruction is 2. 
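
The XRay sleds emitted in these hunks share one shape: a two-byte short JMP (EB imm8) whose displacement skips the bytes that follow, so the runtime can later patch the whole region into a call. A rough layout sketch; the 0x90 filler bytes are illustrative placeholders, the real sleds emit multi-byte NOPs or the push/mov/call sequences shown above:

#include <cassert>
#include <cstdint>
#include <vector>

// Build a sled: a short jump over PaddingBytes bytes, then the padding.
static std::vector<uint8_t> makeSled(uint8_t PaddingBytes) {
  std::vector<uint8_t> Sled = {0xEB, PaddingBytes}; // jmp rel8 over the padding
  Sled.insert(Sled.end(), PaddingBytes, 0x90);      // placeholder filler bytes
  return Sled;
}

int main() {
  // Function entry/exit sleds: EB 09 followed by 9 bytes -> 11 bytes total.
  assert(makeSled(0x09).size() == 11);
  // Custom event sled: EB 0F skips 15 bytes; typed event sled: EB 14 skips 20.
  assert(makeSled(0x0F).size() == 17 && makeSled(0x14).size() == 22);
  return 0;
}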
auto CurSled = OutContext.createTempSymbol("xray_sled_", true); - OutStreamer->EmitCodeAlignment(2); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitCodeAlignment(2); + OutStreamer->emitLabel(CurSled); unsigned OpCode = MI.getOperand(0).getImm(); MCInst Ret; Ret.setOpcode(OpCode); for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) Ret.addOperand(MaybeOperand.getValue()); - OutStreamer->EmitInstruction(Ret, getSubtargetInfo()); - EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo()); - recordSled(CurSled, MI, SledKind::FUNCTION_EXIT); + OutStreamer->emitInstruction(Ret, getSubtargetInfo()); + emitX86Nops(*OutStreamer, 10, Subtarget); + recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2); } void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, @@ -1694,17 +1724,17 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, // the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual // tail call much like how we have it in PATCHABLE_RET. auto CurSled = OutContext.createTempSymbol("xray_sled_", true); - OutStreamer->EmitCodeAlignment(2); - OutStreamer->EmitLabel(CurSled); + OutStreamer->emitCodeAlignment(2); + OutStreamer->emitLabel(CurSled); auto Target = OutContext.createTempSymbol(); // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as // an operand (computed as an offset from the jmp instruction). // FIXME: Find another less hacky way do force the relative jump. - OutStreamer->EmitBytes("\xeb\x09"); - EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo()); - OutStreamer->EmitLabel(Target); - recordSled(CurSled, MI, SledKind::TAIL_CALL); + OutStreamer->emitBytes("\xeb\x09"); + emitX86Nops(*OutStreamer, 9, Subtarget); + OutStreamer->emitLabel(Target); + recordSled(CurSled, MI, SledKind::TAIL_CALL, 2); unsigned OpCode = MI.getOperand(0).getImm(); OpCode = convertTailJumpOpcode(OpCode); @@ -1717,7 +1747,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end())) if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) TC.addOperand(MaybeOperand.getValue()); - OutStreamer->EmitInstruction(TC, getSubtargetInfo()); + OutStreamer->emitInstruction(TC, getSubtargetInfo()); } // Returns instruction preceding MBBI in MachineFunction. @@ -1961,300 +1991,9 @@ static unsigned getRegisterWidth(const MCOperandInfo &Info) { llvm_unreachable("Unknown register class!"); } -void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { - X86MCInstLower MCInstLowering(*MF, *this); - const X86RegisterInfo *RI = - MF->getSubtarget<X86Subtarget>().getRegisterInfo(); - - // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that - // are compressed from EVEX encoding to VEX encoding. - if (TM.Options.MCOptions.ShowMCEncoding) { - if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX) - OutStreamer->AddComment("EVEX TO VEX Compression ", false); - } - +static void addConstantComments(const MachineInstr *MI, + MCStreamer &OutStreamer) { switch (MI->getOpcode()) { - case TargetOpcode::DBG_VALUE: - llvm_unreachable("Should be handled target independently"); - - // Emit nothing here but a comment if we can. - case X86::Int_MemBarrier: - OutStreamer->emitRawComment("MEMBARRIER"); - return; - - case X86::EH_RETURN: - case X86::EH_RETURN64: { - // Lower these as normal, but add some comments. 
- Register Reg = MI->getOperand(0).getReg(); - OutStreamer->AddComment(StringRef("eh_return, addr: %") + - X86ATTInstPrinter::getRegisterName(Reg)); - break; - } - case X86::CLEANUPRET: { - // Lower these as normal, but add some comments. - OutStreamer->AddComment("CLEANUPRET"); - break; - } - - case X86::CATCHRET: { - // Lower these as normal, but add some comments. - OutStreamer->AddComment("CATCHRET"); - break; - } - - case X86::ENDBR32: - case X86::ENDBR64: { - // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for - // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be - // non-empty. If MI is the initial ENDBR, place the - // __patchable_function_entries label after ENDBR. - if (CurrentPatchableFunctionEntrySym && - CurrentPatchableFunctionEntrySym == CurrentFnBegin && - MI == &MF->front().front()) { - MCInst Inst; - MCInstLowering.Lower(MI, Inst); - EmitAndCountInstruction(Inst); - CurrentPatchableFunctionEntrySym = createTempSymbol("patch"); - OutStreamer->EmitLabel(CurrentPatchableFunctionEntrySym); - return; - } - break; - } - - case X86::TAILJMPr: - case X86::TAILJMPm: - case X86::TAILJMPd: - case X86::TAILJMPd_CC: - case X86::TAILJMPr64: - case X86::TAILJMPm64: - case X86::TAILJMPd64: - case X86::TAILJMPd64_CC: - case X86::TAILJMPr64_REX: - case X86::TAILJMPm64_REX: - // Lower these as normal, but add some comments. - OutStreamer->AddComment("TAILCALL"); - break; - - case X86::TLS_addr32: - case X86::TLS_addr64: - case X86::TLS_base_addr32: - case X86::TLS_base_addr64: - return LowerTlsAddr(MCInstLowering, *MI); - - // Loading/storing mask pairs requires two kmov operations. The second one of these - // needs a 2 byte displacement relative to the specified address (with 32 bit spill - // size). The pairs of 1bit masks up to 16 bit masks all use the same spill size, - // they all are stored using MASKPAIR16STORE, loaded using MASKPAIR16LOAD. - // - // The displacement value might wrap around in theory, thus the asserts in both - // cases. 
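
The MASKPAIR16LOAD/MASKPAIR16STORE lowering that follows splits a mask-pair spill into two 16-bit KMOVW accesses against the same base address, the second at displacement + 2. A plain-memory sketch of that layout, showing only the offset arithmetic rather than any MC lowering:

#include <cassert>
#include <cstdint>
#include <cstring>

struct SpillSlot { uint8_t Bytes[4]; };

// Two KMOVWmk stores: the first half at Disp, the second at Disp + 2.
static void storeMaskPair(SpillSlot &S, int64_t Disp, uint16_t K0, uint16_t K1) {
  std::memcpy(S.Bytes + Disp, &K0, 2);
  std::memcpy(S.Bytes + Disp + 2, &K1, 2);
}

// Two KMOVWkm loads from the same pair of displacements.
static void loadMaskPair(const SpillSlot &S, int64_t Disp, uint16_t &K0, uint16_t &K1) {
  std::memcpy(&K0, S.Bytes + Disp, 2);
  std::memcpy(&K1, S.Bytes + Disp + 2, 2);
}

int main() {
  SpillSlot Slot = {};
  storeMaskPair(Slot, 0, 0xBEEF, 0xCAFE);
  uint16_t A = 0, B = 0;
  loadMaskPair(Slot, 0, A, B);
  assert(A == 0xBEEF && B == 0xCAFE);
  return 0;
}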
- case X86::MASKPAIR16LOAD: { - int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm(); - assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); - Register Reg = MI->getOperand(0).getReg(); - Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); - Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); - - // Load the first mask register - MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm); - MIB.addReg(Reg0); - for (int i = 0; i < X86::AddrNumOperands; ++i) { - auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i)); - MIB.addOperand(Op.getValue()); - } - EmitAndCountInstruction(MIB); - - // Load the second mask register of the pair - MIB = MCInstBuilder(X86::KMOVWkm); - MIB.addReg(Reg1); - for (int i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) { - MIB.addImm(Disp + 2); - } else { - auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i)); - MIB.addOperand(Op.getValue()); - } - } - EmitAndCountInstruction(MIB); - return; - } - - case X86::MASKPAIR16STORE: { - int64_t Disp = MI->getOperand(X86::AddrDisp).getImm(); - assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); - Register Reg = MI->getOperand(X86::AddrNumOperands).getReg(); - Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0); - Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1); - - // Store the first mask register - MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk); - for (int i = 0; i < X86::AddrNumOperands; ++i) - MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue()); - MIB.addReg(Reg0); - EmitAndCountInstruction(MIB); - - // Store the second mask register of the pair - MIB = MCInstBuilder(X86::KMOVWmk); - for (int i = 0; i < X86::AddrNumOperands; ++i) { - if (i == X86::AddrDisp) { - MIB.addImm(Disp + 2); - } else { - auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i)); - MIB.addOperand(Op.getValue()); - } - } - MIB.addReg(Reg1); - EmitAndCountInstruction(MIB); - return; - } - - case X86::MOVPC32r: { - // This is a pseudo op for a two instruction sequence with a label, which - // looks like: - // call "L1$pb" - // "L1$pb": - // popl %esi - - // Emit the call. - MCSymbol *PICBase = MF->getPICBaseSymbol(); - // FIXME: We would like an efficient form for this, so we don't have to do a - // lot of extra uniquing. - EmitAndCountInstruction( - MCInstBuilder(X86::CALLpcrel32) - .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); - - const X86FrameLowering *FrameLowering = - MF->getSubtarget<X86Subtarget>().getFrameLowering(); - bool hasFP = FrameLowering->hasFP(*MF); - - // TODO: This is needed only if we require precise CFA. - bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && - !OutStreamer->getDwarfFrameInfos().back().End; - - int stackGrowth = -RI->getSlotSize(); - - if (HasActiveDwarfFrame && !hasFP) { - OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); - } - - // Emit the label. - OutStreamer->EmitLabel(PICBase); - - // popl $reg - EmitAndCountInstruction( - MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg())); - - if (HasActiveDwarfFrame && !hasFP) { - OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); - } - return; - } - - case X86::ADD32ri: { - // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri. - if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS) - break; - - // Okay, we have something like: - // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL) - - // For this, we want to print something like: - // MYGLOBAL + (. 
- PICBASE) - // However, we can't generate a ".", so just emit a new label here and refer - // to it. - MCSymbol *DotSym = OutContext.createTempSymbol(); - OutStreamer->EmitLabel(DotSym); - - // Now that we have emitted the label, lower the complex operand expression. - MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); - - const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); - const MCExpr *PICBase = - MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext); - DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext); - - DotExpr = MCBinaryExpr::createAdd( - MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext); - - EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri) - .addReg(MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()) - .addExpr(DotExpr)); - return; - } - case TargetOpcode::STATEPOINT: - return LowerSTATEPOINT(*MI, MCInstLowering); - - case TargetOpcode::FAULTING_OP: - return LowerFAULTING_OP(*MI, MCInstLowering); - - case TargetOpcode::FENTRY_CALL: - return LowerFENTRY_CALL(*MI, MCInstLowering); - - case TargetOpcode::PATCHABLE_OP: - return LowerPATCHABLE_OP(*MI, MCInstLowering); - - case TargetOpcode::STACKMAP: - return LowerSTACKMAP(*MI); - - case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(*MI, MCInstLowering); - - case TargetOpcode::PATCHABLE_FUNCTION_ENTER: - return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering); - - case TargetOpcode::PATCHABLE_RET: - return LowerPATCHABLE_RET(*MI, MCInstLowering); - - case TargetOpcode::PATCHABLE_TAIL_CALL: - return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering); - - case TargetOpcode::PATCHABLE_EVENT_CALL: - return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering); - - case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: - return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering); - - case X86::MORESTACK_RET: - EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); - return; - - case X86::MORESTACK_RET_RESTORE_R10: - // Return, then restore R10. - EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); - EmitAndCountInstruction( - MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX)); - return; - - case X86::SEH_PushReg: - case X86::SEH_SaveReg: - case X86::SEH_SaveXMM: - case X86::SEH_StackAlloc: - case X86::SEH_StackAlign: - case X86::SEH_SetFrame: - case X86::SEH_PushFrame: - case X86::SEH_EndPrologue: - EmitSEHInstruction(MI); - return; - - case X86::SEH_Epilogue: { - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - MachineBasicBlock::const_iterator MBBI(MI); - // Check if preceded by a call and emit nop if so. - for (MBBI = PrevCrossBBInst(MBBI); - MBBI != MachineBasicBlock::const_iterator(); - MBBI = PrevCrossBBInst(MBBI)) { - // Conservatively assume that pseudo instructions don't emit code and keep - // looking for a call. We may emit an unnecessary nop in some cases. - if (!MBBI->isPseudo()) { - if (MBBI->isCall()) - EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); - break; - } - } - return; - } - // Lower PSHUFB and VPERMILP normally but add a comment if we can find // a constant shuffle mask. We won't be able to do this at the MC layer // because the mask isn't an immediate. 
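The hunk that follows drops the hand-maintained SrcIdx/MaskIdx switches and derives both indices from the instruction description instead: a k-masked form carries one extra operand (the mask register), a merge-masked form one more (the pass-through value), and the constant-pool mask is the displacement of the memory operand that starts right after the source. A minimal, self-contained sketch of that arithmetic, with plain constants standing in for llvm::X86::AddrDisp / AddrNumOperands and booleans standing in for the X86II::isKMasked / isKMergeMasked queries (those names come from the diff; the snippet itself is only an illustration, not part of the change):

#include <cassert>
#include <cstdio>

// Stand-ins for X86::AddrDisp / X86::AddrNumOperands: an x86 memory operand
// is base, scale, index, displacement, segment.
constexpr unsigned AddrDisp = 3;
constexpr unsigned AddrNumOperands = 5;

// Operand 0 is the destination; a k-masked form inserts the mask register,
// and a merge-masked form also carries a pass-through operand.
unsigned srcIdx(bool KMasked, bool MergeMasked) {
  unsigned Idx = 1;
  if (KMasked) {
    ++Idx;
    if (MergeMasked)
      ++Idx;
  }
  return Idx;
}

// The shuffle mask is the displacement slot of the memory operand that
// follows the source register.
unsigned maskIdx(unsigned SrcIdx) { return SrcIdx + 1 + AddrDisp; }

int main() {
  // Reproduces the values hard-coded in the switch being removed: 1/5, 2/6, 3/7.
  assert(maskIdx(srcIdx(false, false)) == 5); // e.g. VPSHUFBrm
  assert(maskIdx(srcIdx(true, false)) == 6);  // e.g. VPSHUFBZrmkz
  assert(maskIdx(srcIdx(true, true)) == 7);   // e.g. VPSHUFBZrmk
  // Matches the new operand-count assertion: SrcIdx + 1 + AddrNumOperands.
  std::printf("minimum operands for a merge-masked rm form: %u\n",
              srcIdx(true, true) + 1 + AddrNumOperands);
  return 0;
}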
@@ -2270,30 +2009,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPSHUFBZrm: case X86::VPSHUFBZrmk: case X86::VPSHUFBZrmkz: { - if (!OutStreamer->isVerboseAsm()) - break; - unsigned SrcIdx, MaskIdx; - switch (MI->getOpcode()) { - default: llvm_unreachable("Invalid opcode"); - case X86::PSHUFBrm: - case X86::VPSHUFBrm: - case X86::VPSHUFBYrm: - case X86::VPSHUFBZ128rm: - case X86::VPSHUFBZ256rm: - case X86::VPSHUFBZrm: - SrcIdx = 1; MaskIdx = 5; break; - case X86::VPSHUFBZ128rmkz: - case X86::VPSHUFBZ256rmkz: - case X86::VPSHUFBZrmkz: - SrcIdx = 2; MaskIdx = 6; break; - case X86::VPSHUFBZ128rmk: - case X86::VPSHUFBZ256rmk: - case X86::VPSHUFBZrmk: - SrcIdx = 3; MaskIdx = 7; break; + unsigned SrcIdx = 1; + if (X86II::isKMasked(MI->getDesc().TSFlags)) { + // Skip mask operand. + ++SrcIdx; + if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) { + // Skip passthru operand. + ++SrcIdx; + } } + unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp; - assert(MI->getNumOperands() >= 6 && - "We should always have at least 6 operands!"); + assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { @@ -2301,7 +2029,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 64> Mask; DecodePSHUFBMask(C, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -2328,9 +2056,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMILPDZrm: case X86::VPERMILPDZrmk: case X86::VPERMILPDZrmkz: { - if (!OutStreamer->isVerboseAsm()) - break; - unsigned SrcIdx, MaskIdx; unsigned ElSize; switch (MI->getOpcode()) { default: llvm_unreachable("Invalid opcode"); @@ -2339,33 +2064,42 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMILPSZ128rm: case X86::VPERMILPSZ256rm: case X86::VPERMILPSZrm: - SrcIdx = 1; MaskIdx = 5; ElSize = 32; break; case X86::VPERMILPSZ128rmkz: case X86::VPERMILPSZ256rmkz: case X86::VPERMILPSZrmkz: - SrcIdx = 2; MaskIdx = 6; ElSize = 32; break; case X86::VPERMILPSZ128rmk: case X86::VPERMILPSZ256rmk: case X86::VPERMILPSZrmk: - SrcIdx = 3; MaskIdx = 7; ElSize = 32; break; + ElSize = 32; + break; case X86::VPERMILPDrm: case X86::VPERMILPDYrm: case X86::VPERMILPDZ128rm: case X86::VPERMILPDZ256rm: case X86::VPERMILPDZrm: - SrcIdx = 1; MaskIdx = 5; ElSize = 64; break; case X86::VPERMILPDZ128rmkz: case X86::VPERMILPDZ256rmkz: case X86::VPERMILPDZrmkz: - SrcIdx = 2; MaskIdx = 6; ElSize = 64; break; case X86::VPERMILPDZ128rmk: case X86::VPERMILPDZ256rmk: case X86::VPERMILPDZrmk: - SrcIdx = 3; MaskIdx = 7; ElSize = 64; break; + ElSize = 64; + break; } - assert(MI->getNumOperands() >= 6 && - "We should always have at least 6 operands!"); + unsigned SrcIdx = 1; + if (X86II::isKMasked(MI->getDesc().TSFlags)) { + // Skip mask operand. + ++SrcIdx; + if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) { + // Skip passthru operand. 
+ ++SrcIdx; + } + } + unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp; + + assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { @@ -2373,7 +2107,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { SmallVector<int, 16> Mask; DecodeVPERMILPMask(C, ElSize, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); + OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask)); } break; } @@ -2382,10 +2116,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMIL2PSrm: case X86::VPERMIL2PDYrm: case X86::VPERMIL2PSYrm: { - if (!OutStreamer->isVerboseAsm()) - break; - assert(MI->getNumOperands() >= 8 && - "We should always have at least 8 operands!"); + assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands + 1) && + "Unexpected number of operands!"); const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1); if (!CtrlOp.isImm()) @@ -2398,47 +2130,43 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break; } - const MachineOperand &MaskOp = MI->getOperand(6); + const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp); if (auto *C = getConstantFromPool(*MI, MaskOp)) { unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]); SmallVector<int, 16> Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } case X86::VPPERMrrm: { - if (!OutStreamer->isVerboseAsm()) - break; - assert(MI->getNumOperands() >= 7 && - "We should always have at least 7 operands!"); + assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands) && + "Unexpected number of operands!"); - const MachineOperand &MaskOp = MI->getOperand(6); + const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp); if (auto *C = getConstantFromPool(*MI, MaskOp)) { unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]); SmallVector<int, 16> Mask; DecodeVPPERMMask(C, Width, Mask); if (!Mask.empty()) - OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask)); + OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask)); } break; } case X86::MMX_MOVQ64rm: { - if (!OutStreamer->isVerboseAsm()) - break; - if (MI->getNumOperands() <= 4) - break; - if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + assert(MI->getNumOperands() == (1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); + if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { std::string Comment; raw_string_ostream CS(Comment); const MachineOperand &DstOp = MI->getOperand(0); CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; if (auto *CF = dyn_cast<ConstantFP>(C)) { CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false); - OutStreamer->AddComment(CS.str()); + OutStreamer.AddComment(CS.str()); } } break; @@ -2489,11 +2217,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { case X86::VBROADCASTI64X2Z128rm: case X86::VBROADCASTI64X2rm: case X86::VBROADCASTI64X4rm: - if (!OutStreamer->isVerboseAsm()) - break; - if (MI->getNumOperands() <= 4) - break; - if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && + 
"Unexpected number of operands!"); + if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { int NumLanes = 1; // Override NumLanes for the broadcast instructions. switch (MI->getOpcode()) { @@ -2535,7 +2261,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << "]"; - OutStreamer->AddComment(CS.str()); + OutStreamer.AddComment(CS.str()); } else if (auto *CV = dyn_cast<ConstantVector>(C)) { CS << "<"; for (int l = 0; l != NumLanes; ++l) { @@ -2547,80 +2273,79 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } CS << ">"; - OutStreamer->AddComment(CS.str()); + OutStreamer.AddComment(CS.str()); } } break; + case X86::MOVDDUPrm: case X86::VMOVDDUPrm: case X86::VMOVDDUPZ128rm: case X86::VBROADCASTSSrm: case X86::VBROADCASTSSYrm: - case X86::VBROADCASTSSZ128m: - case X86::VBROADCASTSSZ256m: - case X86::VBROADCASTSSZm: + case X86::VBROADCASTSSZ128rm: + case X86::VBROADCASTSSZ256rm: + case X86::VBROADCASTSSZrm: case X86::VBROADCASTSDYrm: - case X86::VBROADCASTSDZ256m: - case X86::VBROADCASTSDZm: + case X86::VBROADCASTSDZ256rm: + case X86::VBROADCASTSDZrm: case X86::VPBROADCASTBrm: case X86::VPBROADCASTBYrm: - case X86::VPBROADCASTBZ128m: - case X86::VPBROADCASTBZ256m: - case X86::VPBROADCASTBZm: + case X86::VPBROADCASTBZ128rm: + case X86::VPBROADCASTBZ256rm: + case X86::VPBROADCASTBZrm: case X86::VPBROADCASTDrm: case X86::VPBROADCASTDYrm: - case X86::VPBROADCASTDZ128m: - case X86::VPBROADCASTDZ256m: - case X86::VPBROADCASTDZm: + case X86::VPBROADCASTDZ128rm: + case X86::VPBROADCASTDZ256rm: + case X86::VPBROADCASTDZrm: case X86::VPBROADCASTQrm: case X86::VPBROADCASTQYrm: - case X86::VPBROADCASTQZ128m: - case X86::VPBROADCASTQZ256m: - case X86::VPBROADCASTQZm: + case X86::VPBROADCASTQZ128rm: + case X86::VPBROADCASTQZ256rm: + case X86::VPBROADCASTQZrm: case X86::VPBROADCASTWrm: case X86::VPBROADCASTWYrm: - case X86::VPBROADCASTWZ128m: - case X86::VPBROADCASTWZ256m: - case X86::VPBROADCASTWZm: - if (!OutStreamer->isVerboseAsm()) - break; - if (MI->getNumOperands() <= 4) - break; - if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + case X86::VPBROADCASTWZ128rm: + case X86::VPBROADCASTWZ256rm: + case X86::VPBROADCASTWZrm: + assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); + if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { int NumElts; switch (MI->getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::MOVDDUPrm: NumElts = 2; break; - case X86::VMOVDDUPrm: NumElts = 2; break; - case X86::VMOVDDUPZ128rm: NumElts = 2; break; - case X86::VBROADCASTSSrm: NumElts = 4; break; - case X86::VBROADCASTSSYrm: NumElts = 8; break; - case X86::VBROADCASTSSZ128m: NumElts = 4; break; - case X86::VBROADCASTSSZ256m: NumElts = 8; break; - case X86::VBROADCASTSSZm: NumElts = 16; break; - case X86::VBROADCASTSDYrm: NumElts = 4; break; - case X86::VBROADCASTSDZ256m: NumElts = 4; break; - case X86::VBROADCASTSDZm: NumElts = 8; break; - case X86::VPBROADCASTBrm: NumElts = 16; break; - case X86::VPBROADCASTBYrm: NumElts = 32; break; - case X86::VPBROADCASTBZ128m: NumElts = 16; break; - case X86::VPBROADCASTBZ256m: NumElts = 32; break; - case X86::VPBROADCASTBZm: NumElts = 64; break; - case X86::VPBROADCASTDrm: NumElts = 4; break; - case X86::VPBROADCASTDYrm: NumElts = 8; break; - case X86::VPBROADCASTDZ128m: NumElts = 4; break; - case X86::VPBROADCASTDZ256m: NumElts = 8; break; - case X86::VPBROADCASTDZm: NumElts = 16; break; - case X86::VPBROADCASTQrm: NumElts = 2; 
break; - case X86::VPBROADCASTQYrm: NumElts = 4; break; - case X86::VPBROADCASTQZ128m: NumElts = 2; break; - case X86::VPBROADCASTQZ256m: NumElts = 4; break; - case X86::VPBROADCASTQZm: NumElts = 8; break; - case X86::VPBROADCASTWrm: NumElts = 8; break; - case X86::VPBROADCASTWYrm: NumElts = 16; break; - case X86::VPBROADCASTWZ128m: NumElts = 8; break; - case X86::VPBROADCASTWZ256m: NumElts = 16; break; - case X86::VPBROADCASTWZm: NumElts = 32; break; + case X86::MOVDDUPrm: NumElts = 2; break; + case X86::VMOVDDUPrm: NumElts = 2; break; + case X86::VMOVDDUPZ128rm: NumElts = 2; break; + case X86::VBROADCASTSSrm: NumElts = 4; break; + case X86::VBROADCASTSSYrm: NumElts = 8; break; + case X86::VBROADCASTSSZ128rm: NumElts = 4; break; + case X86::VBROADCASTSSZ256rm: NumElts = 8; break; + case X86::VBROADCASTSSZrm: NumElts = 16; break; + case X86::VBROADCASTSDYrm: NumElts = 4; break; + case X86::VBROADCASTSDZ256rm: NumElts = 4; break; + case X86::VBROADCASTSDZrm: NumElts = 8; break; + case X86::VPBROADCASTBrm: NumElts = 16; break; + case X86::VPBROADCASTBYrm: NumElts = 32; break; + case X86::VPBROADCASTBZ128rm: NumElts = 16; break; + case X86::VPBROADCASTBZ256rm: NumElts = 32; break; + case X86::VPBROADCASTBZrm: NumElts = 64; break; + case X86::VPBROADCASTDrm: NumElts = 4; break; + case X86::VPBROADCASTDYrm: NumElts = 8; break; + case X86::VPBROADCASTDZ128rm: NumElts = 4; break; + case X86::VPBROADCASTDZ256rm: NumElts = 8; break; + case X86::VPBROADCASTDZrm: NumElts = 16; break; + case X86::VPBROADCASTQrm: NumElts = 2; break; + case X86::VPBROADCASTQYrm: NumElts = 4; break; + case X86::VPBROADCASTQZ128rm: NumElts = 2; break; + case X86::VPBROADCASTQZ256rm: NumElts = 4; break; + case X86::VPBROADCASTQZrm: NumElts = 8; break; + case X86::VPBROADCASTWrm: NumElts = 8; break; + case X86::VPBROADCASTWYrm: NumElts = 16; break; + case X86::VPBROADCASTWZ128rm: NumElts = 8; break; + case X86::VPBROADCASTWZ256rm: NumElts = 16; break; + case X86::VPBROADCASTWZrm: NumElts = 32; break; } std::string Comment; @@ -2634,8 +2359,241 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { printConstant(C, CS); } CS << "]"; - OutStreamer->AddComment(CS.str()); + OutStreamer.AddComment(CS.str()); + } + } +} + +void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { + X86MCInstLower MCInstLowering(*MF, *this); + const X86RegisterInfo *RI = + MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that + // are compressed from EVEX encoding to VEX encoding. + if (TM.Options.MCOptions.ShowMCEncoding) { + if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX) + OutStreamer->AddComment("EVEX TO VEX Compression ", false); + } + + // Add comments for values loaded from constant pool. + if (OutStreamer->isVerboseAsm()) + addConstantComments(MI, *OutStreamer); + + switch (MI->getOpcode()) { + case TargetOpcode::DBG_VALUE: + llvm_unreachable("Should be handled target independently"); + + // Emit nothing here but a comment if we can. + case X86::Int_MemBarrier: + OutStreamer->emitRawComment("MEMBARRIER"); + return; + + case X86::EH_RETURN: + case X86::EH_RETURN64: { + // Lower these as normal, but add some comments. + Register Reg = MI->getOperand(0).getReg(); + OutStreamer->AddComment(StringRef("eh_return, addr: %") + + X86ATTInstPrinter::getRegisterName(Reg)); + break; + } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. 
+ OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + + case X86::ENDBR32: + case X86::ENDBR64: { + // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for + // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be + // non-empty. If MI is the initial ENDBR, place the + // __patchable_function_entries label after ENDBR. + if (CurrentPatchableFunctionEntrySym && + CurrentPatchableFunctionEntrySym == CurrentFnBegin && + MI == &MF->front().front()) { + MCInst Inst; + MCInstLowering.Lower(MI, Inst); + EmitAndCountInstruction(Inst); + CurrentPatchableFunctionEntrySym = createTempSymbol("patch"); + OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym); + return; } + break; + } + + case X86::TAILJMPr: + case X86::TAILJMPm: + case X86::TAILJMPd: + case X86::TAILJMPd_CC: + case X86::TAILJMPr64: + case X86::TAILJMPm64: + case X86::TAILJMPd64: + case X86::TAILJMPd64_CC: + case X86::TAILJMPr64_REX: + case X86::TAILJMPm64_REX: + // Lower these as normal, but add some comments. + OutStreamer->AddComment("TAILCALL"); + break; + + case X86::TLS_addr32: + case X86::TLS_addr64: + case X86::TLS_base_addr32: + case X86::TLS_base_addr64: + return LowerTlsAddr(MCInstLowering, *MI); + + case X86::MOVPC32r: { + // This is a pseudo op for a two instruction sequence with a label, which + // looks like: + // call "L1$pb" + // "L1$pb": + // popl %esi + + // Emit the call. + MCSymbol *PICBase = MF->getPICBaseSymbol(); + // FIXME: We would like an efficient form for this, so we don't have to do a + // lot of extra uniquing. + EmitAndCountInstruction( + MCInstBuilder(X86::CALLpcrel32) + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + + const X86FrameLowering *FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. + bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->emitCFIAdjustCfaOffset(-stackGrowth); + } + + // Emit the label. + OutStreamer->emitLabel(PICBase); + + // popl $reg + EmitAndCountInstruction( + MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->emitCFIAdjustCfaOffset(stackGrowth); + } + return; + } + + case X86::ADD32ri: { + // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri. + if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS) + break; + + // Okay, we have something like: + // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL) + + // For this, we want to print something like: + // MYGLOBAL + (. - PICBASE) + // However, we can't generate a ".", so just emit a new label here and refer + // to it. + MCSymbol *DotSym = OutContext.createTempSymbol(); + OutStreamer->emitLabel(DotSym); + + // Now that we have emitted the label, lower the complex operand expression. 
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2)); + + const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext); + const MCExpr *PICBase = + MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext); + DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext); + + DotExpr = MCBinaryExpr::createAdd( + MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext); + + EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(DotExpr)); + return; + } + case TargetOpcode::STATEPOINT: + return LowerSTATEPOINT(*MI, MCInstLowering); + + case TargetOpcode::FAULTING_OP: + return LowerFAULTING_OP(*MI, MCInstLowering); + + case TargetOpcode::FENTRY_CALL: + return LowerFENTRY_CALL(*MI, MCInstLowering); + + case TargetOpcode::PATCHABLE_OP: + return LowerPATCHABLE_OP(*MI, MCInstLowering); + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(*MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(*MI, MCInstLowering); + + case TargetOpcode::PATCHABLE_FUNCTION_ENTER: + return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering); + + case TargetOpcode::PATCHABLE_RET: + return LowerPATCHABLE_RET(*MI, MCInstLowering); + + case TargetOpcode::PATCHABLE_TAIL_CALL: + return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering); + + case TargetOpcode::PATCHABLE_EVENT_CALL: + return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering); + + case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: + return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering); + + case X86::MORESTACK_RET: + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + return; + + case X86::MORESTACK_RET_RESTORE_R10: + // Return, then restore R10. + EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget))); + EmitAndCountInstruction( + MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX)); + return; + + case X86::SEH_PushReg: + case X86::SEH_SaveReg: + case X86::SEH_SaveXMM: + case X86::SEH_StackAlloc: + case X86::SEH_StackAlign: + case X86::SEH_SetFrame: + case X86::SEH_PushFrame: + case X86::SEH_EndPrologue: + EmitSEHInstruction(MI); + return; + + case X86::SEH_Epilogue: { + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); + MachineBasicBlock::const_iterator MBBI(MI); + // Check if preceded by a call and emit nop if so. + for (MBBI = PrevCrossBBInst(MBBI); + MBBI != MachineBasicBlock::const_iterator(); + MBBI = PrevCrossBBInst(MBBI)) { + // Conservatively assume that pseudo instructions don't emit code and keep + // looking for a call. We may emit an unnecessary nop in some cases. + if (!MBBI->isPseudo()) { + if (MBBI->isCall()) + EmitAndCountInstruction(MCInstBuilder(X86::NOOP)); + break; + } + } + return; + } } MCInst TmpInst; @@ -2652,7 +2610,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // after it. 
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); // Then emit the call - OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo()); + OutStreamer->emitInstruction(TmpInst, getSubtargetInfo()); return; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 5cb80a082b56..eedad952c3b9 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -13,9 +13,10 @@ #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/Support/MachineValueType.h" namespace llvm { @@ -62,12 +63,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// SRetReturnReg - Some subtargets require that sret lowering includes /// returning the value of the returned struct in a register. This field /// holds the virtual register into which the sret argument is passed. - unsigned SRetReturnReg = 0; + Register SRetReturnReg; /// GlobalBaseReg - keeps track of the virtual register initialized for /// use as the global base register. This is used for PIC in some PIC /// relocation models. - unsigned GlobalBaseReg = 0; + Register GlobalBaseReg; /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex = 0; @@ -104,6 +105,13 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// True if this function has WIN_ALLOCA instructions. bool HasWinAlloca = false; + /// True if this function has any preallocated calls. + bool HasPreallocatedCall = false; + + ValueMap<const Value *, size_t> PreallocatedIds; + SmallVector<size_t, 0> PreallocatedStackSizes; + SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. 
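The preallocated-call bookkeeping introduced above (with its accessors in the next hunk) uses a common dense-index pattern: the first lookup of a call site hands out the next id and grows two parallel vectors, later lookups return the same id, and the setters/getters simply index those vectors. A standalone sketch of the same pattern, assuming nothing from LLVM (std::unordered_map and std::vector replace the ValueMap/SmallVector members, and the names are illustrative only):

#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

struct PreallocatedInfo {
  std::unordered_map<const void *, std::size_t> Ids; // call site -> dense id
  std::vector<std::size_t> StackSizes;               // indexed by id
  std::vector<std::vector<std::size_t>> ArgOffsets;  // indexed by id

  // First query for a call site allocates the next id and grows the parallel
  // vectors; repeated queries return the id already assigned.
  std::size_t idForCallSite(const void *CS) {
    auto Insert = Ids.insert({CS, Ids.size()});
    if (Insert.second) {
      StackSizes.push_back(0);
      ArgOffsets.emplace_back();
    }
    return Insert.first->second;
  }
};

int main() {
  PreallocatedInfo Info;
  int SiteA, SiteB; // stand-ins for two distinct call sites
  std::size_t IdA = Info.idForCallSite(&SiteA);
  Info.StackSizes[IdA] = 32;
  Info.ArgOffsets[IdA] = {0, 16};
  assert(Info.idForCallSite(&SiteA) == IdA);     // stable across lookups
  assert(Info.idForCallSite(&SiteB) == IdA + 1); // next site gets the next id
  return 0;
}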
@@ -143,11 +151,11 @@ public: int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} - unsigned getSRetReturnReg() const { return SRetReturnReg; } - void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; } + Register getSRetReturnReg() const { return SRetReturnReg; } + void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; } - unsigned getGlobalBaseReg() const { return GlobalBaseReg; } - void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } + Register getGlobalBaseReg() const { return GlobalBaseReg; } + void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; } int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } @@ -185,6 +193,36 @@ public: bool hasWinAlloca() const { return HasWinAlloca; } void setHasWinAlloca(bool v) { HasWinAlloca = v; } + + bool hasPreallocatedCall() const { return HasPreallocatedCall; } + void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; } + + size_t getPreallocatedIdForCallSite(const Value *CS) { + auto Insert = PreallocatedIds.insert({CS, PreallocatedIds.size()}); + if (Insert.second) { + PreallocatedStackSizes.push_back(0); + PreallocatedArgOffsets.emplace_back(); + } + return Insert.first->second; + } + + void setPreallocatedStackSize(size_t Id, size_t StackSize) { + PreallocatedStackSizes[Id] = StackSize; + } + + size_t getPreallocatedStackSize(const size_t Id) { + assert(PreallocatedStackSizes[Id] != 0 && "stack size not set"); + return PreallocatedStackSizes[Id]; + } + + void setPreallocatedArgOffsets(size_t Id, ArrayRef<size_t> AO) { + PreallocatedArgOffsets[Id].assign(AO.begin(), AO.end()); + } + + const ArrayRef<size_t> getPreallocatedArgOffsets(const size_t Id) { + assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set"); + return PreallocatedArgOffsets[Id]; + } }; } // End llvm namespace diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp index b19d1263e0c9..425054cfdd92 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/X86BaseInfo.h" #include "X86MacroFusion.h" +#include "MCTargetDesc/X86BaseInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MacroFusion.h" #include "llvm/CodeGen/TargetInstrInfo.h" diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h index d4ae54f657a5..05388b275ca3 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h @@ -14,10 +14,12 @@ #ifndef LLVM_LIB_TARGET_X86_X86MACROFUSION_H #define LLVM_LIB_TARGET_X86_X86MACROFUSION_H -#include "llvm/CodeGen/MachineScheduler.h" +#include <memory> namespace llvm { +class ScheduleDAGMutation; + /// Note that you have to add: /// DAG.addMutation(createX86MacroFusionDAGMutation()); /// to X86PassConfig::createMachineScheduler() to have an effect. 
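The comment retained in X86MacroFusion.h above is the usage contract for this header: declaring the mutation is not enough, the target's scheduler factory has to register it. A hedged sketch of what that registration looks like for X86PassConfig::createMachineScheduler (that override lives in X86TargetMachine.cpp, which is not part of this excerpt; createGenericSchedLive and the exact signature are assumptions here, not shown in the diff):

#include "X86MacroFusion.h"
#include "llvm/CodeGen/MachineScheduler.h"

using namespace llvm;

// Sketch of the body of X86PassConfig::createMachineScheduler; the enclosing
// class and the `override` keyword are elided.
static ScheduleDAGInstrs *createX86MachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Without this addMutation call the macro-fusion mutation never runs.
  DAG->addMutation(createX86MacroFusionDAGMutation());
  return DAG;
}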
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp index 0c791b6674dc..c8899a85118e 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -578,7 +578,7 @@ bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, unsigned VReg, int64_t AddrDispShift) { - DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression()); + const DIExpression *Expr = MI.getDebugExpression(); if (AddrDispShift != 0) Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp index 4c6bd0ccc2cd..ec81b07f9e5f 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -58,6 +58,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<ProfileSummaryInfoWrapperPass>(); AU.addRequired<LazyMachineBlockFrequencyInfoPass>(); + AU.addPreserved<LazyMachineBlockFrequencyInfoPass>(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp new file mode 100644 index 000000000000..8784a3df1773 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -0,0 +1,490 @@ +//===-- X86PartialReduction.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass looks for add instructions used by a horizontal reduction to see +// if we might be able to use pmaddwd or psadbw. Some cases of this require +// cross basic block knowledge and can't be done in SelectionDAG. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "X86TargetMachine.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-partial-reduction" + +namespace { + +class X86PartialReduction : public FunctionPass { + const DataLayout *DL; + const X86Subtarget *ST; + +public: + static char ID; // Pass identification, replacement for typeid. 
+ + X86PartialReduction() : FunctionPass(ID) { } + + bool runOnFunction(Function &Fn) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + StringRef getPassName() const override { + return "X86 Partial Reduction"; + } + +private: + bool tryMAddReplacement(Instruction *Op); + bool trySADReplacement(Instruction *Op); +}; +} + +FunctionPass *llvm::createX86PartialReductionPass() { + return new X86PartialReduction(); +} + +char X86PartialReduction::ID = 0; + +INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, + "X86 Partial Reduction", false, false) + +bool X86PartialReduction::tryMAddReplacement(Instruction *Op) { + if (!ST->hasSSE2()) + return false; + + // Need at least 8 elements. + if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8) + return false; + + // Element type should be i32. + if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32)) + return false; + + auto *Mul = dyn_cast<BinaryOperator>(Op); + if (!Mul || Mul->getOpcode() != Instruction::Mul) + return false; + + Value *LHS = Mul->getOperand(0); + Value *RHS = Mul->getOperand(1); + + // LHS and RHS should be only used once or if they are the same then only + // used twice. Only check this when SSE4.1 is enabled and we have zext/sext + // instructions, otherwise we use punpck to emulate zero extend in stages. The + // trunc/ we need to do likely won't introduce new instructions in that case. + if (ST->hasSSE41()) { + if (LHS == RHS) { + if (!isa<Constant>(LHS) && !LHS->hasNUses(2)) + return false; + } else { + if (!isa<Constant>(LHS) && !LHS->hasOneUse()) + return false; + if (!isa<Constant>(RHS) && !RHS->hasOneUse()) + return false; + } + } + + auto CanShrinkOp = [&](Value *Op) { + auto IsFreeTruncation = [&](Value *Op) { + if (auto *Cast = dyn_cast<CastInst>(Op)) { + if (Cast->getParent() == Mul->getParent() && + (Cast->getOpcode() == Instruction::SExt || + Cast->getOpcode() == Instruction::ZExt) && + Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16) + return true; + } + + return isa<Constant>(Op); + }; + + // If the operation can be freely truncated and has enough sign bits we + // can shrink. + if (IsFreeTruncation(Op) && + ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16) + return true; + + // SelectionDAG has limited support for truncating through an add or sub if + // the inputs are freely truncatable. + if (auto *BO = dyn_cast<BinaryOperator>(Op)) { + if (BO->getParent() == Mul->getParent() && + IsFreeTruncation(BO->getOperand(0)) && + IsFreeTruncation(BO->getOperand(1)) && + ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16) + return true; + } + + return false; + }; + + // Both Ops need to be shrinkable. + if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS)) + return false; + + IRBuilder<> Builder(Mul); + + auto *MulTy = cast<FixedVectorType>(Op->getType()); + unsigned NumElts = MulTy->getNumElements(); + + // Extract even elements and odd elements and add them together. This will + // be pattern matched by SelectionDAG to pmaddwd. This instruction will be + // half the original width. + SmallVector<int, 16> EvenMask(NumElts / 2); + SmallVector<int, 16> OddMask(NumElts / 2); + for (int i = 0, e = NumElts / 2; i != e; ++i) { + EvenMask[i] = i * 2; + OddMask[i] = i * 2 + 1; + } + // Creating a new mul so the replaceAllUsesWith below doesn't replace the + // uses in the shuffles we're creating. 
+ Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1)); + Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask); + Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask); + Value *MAdd = Builder.CreateAdd(EvenElts, OddElts); + + // Concatenate zeroes to extend back to the original type. + SmallVector<int, 32> ConcatMask(NumElts); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + Value *Zero = Constant::getNullValue(MAdd->getType()); + Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask); + + Mul->replaceAllUsesWith(Concat); + Mul->eraseFromParent(); + + return true; +} + +bool X86PartialReduction::trySADReplacement(Instruction *Op) { + if (!ST->hasSSE2()) + return false; + + // TODO: There's nothing special about i32, any integer type above i16 should + // work just as well. + if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32)) + return false; + + // Operand should be a select. + auto *SI = dyn_cast<SelectInst>(Op); + if (!SI) + return false; + + // Select needs to implement absolute value. + Value *LHS, *RHS; + auto SPR = matchSelectPattern(SI, LHS, RHS); + if (SPR.Flavor != SPF_ABS) + return false; + + // Need a subtract of two values. + auto *Sub = dyn_cast<BinaryOperator>(LHS); + if (!Sub || Sub->getOpcode() != Instruction::Sub) + return false; + + // Look for zero extend from i8. + auto getZeroExtendedVal = [](Value *Op) -> Value * { + if (auto *ZExt = dyn_cast<ZExtInst>(Op)) + if (cast<VectorType>(ZExt->getOperand(0)->getType()) + ->getElementType() + ->isIntegerTy(8)) + return ZExt->getOperand(0); + + return nullptr; + }; + + // Both operands of the subtract should be extends from vXi8. + Value *Op0 = getZeroExtendedVal(Sub->getOperand(0)); + Value *Op1 = getZeroExtendedVal(Sub->getOperand(1)); + if (!Op0 || !Op1) + return false; + + IRBuilder<> Builder(SI); + + auto *OpTy = cast<FixedVectorType>(Op->getType()); + unsigned NumElts = OpTy->getNumElements(); + + unsigned IntrinsicNumElts; + Intrinsic::ID IID; + if (ST->hasBWI() && NumElts >= 64) { + IID = Intrinsic::x86_avx512_psad_bw_512; + IntrinsicNumElts = 64; + } else if (ST->hasAVX2() && NumElts >= 32) { + IID = Intrinsic::x86_avx2_psad_bw; + IntrinsicNumElts = 32; + } else { + IID = Intrinsic::x86_sse2_psad_bw; + IntrinsicNumElts = 16; + } + + Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID); + + if (NumElts < 16) { + // Pad input with zeroes. + SmallVector<int, 32> ConcatMask(16); + for (unsigned i = 0; i != NumElts; ++i) + ConcatMask[i] = i; + for (unsigned i = NumElts; i != 16; ++i) + ConcatMask[i] = (i % NumElts) + NumElts; + + Value *Zero = Constant::getNullValue(Op0->getType()); + Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask); + Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask); + NumElts = 16; + } + + // Intrinsics produce vXi64 and need to be casted to vXi32. + auto *I32Ty = + FixedVectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4); + + assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!"); + unsigned NumSplits = NumElts / IntrinsicNumElts; + + // First collect the pieces we need. 
+ SmallVector<Value *, 4> Ops(NumSplits); + for (unsigned i = 0; i != NumSplits; ++i) { + SmallVector<int, 64> ExtractMask(IntrinsicNumElts); + std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts); + Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask); + Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask); + Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1}); + Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty); + } + + assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits"); + unsigned Stages = Log2_32(NumSplits); + for (unsigned s = Stages; s > 0; --s) { + unsigned NumConcatElts = + cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2; + for (unsigned i = 0; i != 1U << (s - 1); ++i) { + SmallVector<int, 64> ConcatMask(NumConcatElts); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask); + } + } + + // At this point the final value should be in Ops[0]. Now we need to adjust + // it to the final original type. + NumElts = cast<FixedVectorType>(OpTy)->getNumElements(); + if (NumElts == 2) { + // Extract down to 2 elements. + Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1}); + } else if (NumElts >= 8) { + SmallVector<int, 32> ConcatMask(NumElts); + unsigned SubElts = + cast<FixedVectorType>(Ops[0]->getType())->getNumElements(); + for (unsigned i = 0; i != SubElts; ++i) + ConcatMask[i] = i; + for (unsigned i = SubElts; i != NumElts; ++i) + ConcatMask[i] = (i % SubElts) + SubElts; + + Value *Zero = Constant::getNullValue(Ops[0]->getType()); + Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask); + } + + SI->replaceAllUsesWith(Ops[0]); + SI->eraseFromParent(); + + return true; +} + +// Walk backwards from the ExtractElementInst and determine if it is the end of +// a horizontal reduction. Return the input to the reduction if we find one. +static Value *matchAddReduction(const ExtractElementInst &EE) { + // Make sure we're extracting index 0. + auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand()); + if (!Index || !Index->isNullValue()) + return nullptr; + + const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand()); + if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse()) + return nullptr; + + unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements(); + // Ensure the reduction size is a power of 2. + if (!isPowerOf2_32(NumElems)) + return nullptr; + + const Value *Op = BO; + unsigned Stages = Log2_32(NumElems); + for (unsigned i = 0; i != Stages; ++i) { + const auto *BO = dyn_cast<BinaryOperator>(Op); + if (!BO || BO->getOpcode() != Instruction::Add) + return nullptr; + + // If this isn't the first add, then it should only have 2 users, the + // shuffle and another add which we checked in the previous iteration. + if (i != 0 && !BO->hasNUses(2)) + return nullptr; + + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + + auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS); + if (Shuffle) { + Op = RHS; + } else { + Shuffle = dyn_cast<ShuffleVectorInst>(RHS); + Op = LHS; + } + + // The first operand of the shuffle should be the same as the other operand + // of the bin op. + if (!Shuffle || Shuffle->getOperand(0) != Op) + return nullptr; + + // Verify the shuffle has the expected (at this stage of the pyramid) mask. 
+ unsigned MaskEnd = 1 << i; + for (unsigned Index = 0; Index < MaskEnd; ++Index) + if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index)) + return nullptr; + } + + return const_cast<Value *>(Op); +} + +// See if this BO is reachable from this Phi by walking forward through single +// use BinaryOperators with the same opcode. If we get back then we know we've +// found a loop and it is safe to step through this Add to find more leaves. +static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) { + // The PHI itself should only have one use. + if (!Phi->hasOneUse()) + return false; + + Instruction *U = cast<Instruction>(*Phi->user_begin()); + if (U == BO) + return true; + + while (U->hasOneUse() && U->getOpcode() == BO->getOpcode()) + U = cast<Instruction>(*U->user_begin()); + + return U == BO; +} + +// Collect all the leaves of the tree of adds that feeds into the horizontal +// reduction. Root is the Value that is used by the horizontal reduction. +// We look through single use phis, single use adds, or adds that are used by +// a phi that forms a loop with the add. +static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { + SmallPtrSet<Value *, 8> Visited; + SmallVector<Value *, 8> Worklist; + Worklist.push_back(Root); + + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + if (!Visited.insert(V).second) + continue; + + if (auto *PN = dyn_cast<PHINode>(V)) { + // PHI node should have single use unless it is the root node, then it + // has 2 uses. + if (!PN->hasNUses(PN == Root ? 2 : 1)) + break; + + // Push incoming values to the worklist. + for (Value *InV : PN->incoming_values()) + Worklist.push_back(InV); + + continue; + } + + if (auto *BO = dyn_cast<BinaryOperator>(V)) { + if (BO->getOpcode() == Instruction::Add) { + // Simple case. Single use, just push its operands to the worklist. + if (BO->hasNUses(BO == Root ? 2 : 1)) { + for (Value *Op : BO->operands()) + Worklist.push_back(Op); + continue; + } + + // If there is additional use, make sure it is an unvisited phi that + // gets us back to this node. + if (BO->hasNUses(BO == Root ? 3 : 2)) { + PHINode *PN = nullptr; + for (auto *U : Root->users()) + if (auto *P = dyn_cast<PHINode>(U)) + if (!Visited.count(P)) + PN = P; + + // If we didn't find a 2-input PHI then this isn't a case we can + // handle. + if (!PN || PN->getNumIncomingValues() != 2) + continue; + + // Walk forward from this phi to see if it reaches back to this add. + if (!isReachableFromPHI(PN, BO)) + continue; + + // The phi forms a loop with this Add, push its operands. + for (Value *Op : BO->operands()) + Worklist.push_back(Op); + } + } + } + + // Not an add or phi, make it a leaf. + if (auto *I = dyn_cast<Instruction>(V)) { + if (!V->hasNUses(I == Root ? 2 : 1)) + continue; + + // Add this as a leaf. + Leaves.push_back(I); + } + } +} + +bool X86PartialReduction::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + auto &TM = TPC->getTM<X86TargetMachine>(); + ST = TM.getSubtargetImpl(F); + + DL = &F.getParent()->getDataLayout(); + + bool MadeChange = false; + for (auto &BB : F) { + for (auto &I : BB) { + auto *EE = dyn_cast<ExtractElementInst>(&I); + if (!EE) + continue; + + // First find a reduction tree. + // FIXME: Do we need to handle other opcodes than Add? 
+ Value *Root = matchAddReduction(*EE); + if (!Root) + continue; + + SmallVector<Instruction *, 8> Leaves; + collectLeaves(Root, Leaves); + + for (Instruction *I : Leaves) { + if (tryMAddReplacement(I)) { + MadeChange = true; + continue; + } + + // Don't do SAD matching on the root node. SelectionDAG already + // has support for that and currently generates better code. + if (I != Root && trySADReplacement(I)) + MadeChange = true; + } + } + } + + return MadeChange; +} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td b/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td index 93238983afa2..833013fb69f3 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td @@ -223,3 +223,13 @@ def ZnVer1PfmCounters : ProcPfmCounters { ]; } def : PfmCountersBinding<"znver1", ZnVer1PfmCounters>; + +def ZnVer2PfmCounters : ProcPfmCounters { + let CycleCounter = PfmCounter<"cycles_not_in_halt">; + let UopsCounter = PfmCounter<"retired_uops">; + let IssueCounters = [ + PfmIssueCounter<"Zn2AGU", "ls_dispatch:ld_dispatch + ls_dispatch:store_dispatch">, + PfmIssueCounter<"Zn2Divider", "div_op_count"> + ]; +} +def : PfmCountersBinding<"znver2", ZnVer2PfmCounters>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp index f69626b2622e..f456728cf47b 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -72,12 +72,6 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT) } } -bool -X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { - // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness. - return true; -} - int X86RegisterInfo::getSEHRegNum(unsigned i) const { return getEncodingValue(i); @@ -633,18 +627,22 @@ static bool CantUseSP(const MachineFrameInfo &MFI) { } bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - - if (!EnableBasePointer) - return false; - - // When we need stack realignment, we can't address the stack from the frame - // pointer. When we have dynamic allocas or stack-adjusting inline asm, we - // can't address variables from the stack pointer. MS inline asm can - // reference locals while also adjusting the stack pointer. When we can't - // use both the SP and the FP, we need a separate base pointer register. - bool CantUseFP = needsStackRealignment(MF); - return CantUseFP && CantUseSP(MFI); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + if (X86FI->hasPreallocatedCall()) + return true; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment, we can't address the stack from the frame + // pointer. When we have dynamic allocas or stack-adjusting inline asm, we + // can't address variables from the stack pointer. MS inline asm can + // reference locals while also adjusting the stack pointer. When we can't + // use both the SP and the FP, we need a separate base pointer register. 
+ bool CantUseFP = needsStackRealignment(MF); + return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { @@ -667,7 +665,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, - unsigned Reg, int &FrameIdx) const { + Register Reg, int &FrameIdx) const { // Since X86 defines assignCalleeSavedSpillSlots which always return true // this function neither used nor tested. llvm_unreachable("Unused function on X86. Otherwise need a test case."); @@ -728,7 +726,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Determine base register and offset. int FIOffset; - unsigned BasePtr; + Register BasePtr; if (MI.isReturn()) { assert((!needsStackRealignment(MF) || MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h index b82920898069..3435c0a10b04 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h @@ -54,10 +54,6 @@ public: // FIXME: This should be tablegen'd like getDwarfRegNum is int getSEHRegNum(unsigned i) const; - /// Code Generation virtual methods... - /// - bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; - /// getMatchingSuperRegClass - Return a subclass of the specified register /// class A so that each register in it has a sub-register of the /// specified sub-register index which is in the specified register class B. @@ -125,7 +121,7 @@ public: bool canRealignStack(const MachineFunction &MF) const override; - bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, + bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg, int &FrameIdx) const override; void eliminateFrameIndex(MachineBasicBlock::iterator MI, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td index 3cfaf714e93e..8de5b94bbffa 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td @@ -265,6 +265,16 @@ let SubRegIndices = [sub_ymm] in { } } +// Tile "registers". +def TMM0: X86Reg<"tmm0", 0>; +def TMM1: X86Reg<"tmm1", 1>; +def TMM2: X86Reg<"tmm2", 2>; +def TMM3: X86Reg<"tmm3", 3>; +def TMM4: X86Reg<"tmm4", 4>; +def TMM5: X86Reg<"tmm5", 5>; +def TMM6: X86Reg<"tmm6", 6>; +def TMM7: X86Reg<"tmm7", 7>; + // Mask Registers, used by AVX-512 instructions. def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>; def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>; @@ -498,7 +508,7 @@ def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, // which we do not have right now. def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; -// When RBP is used as a base pointer in a 32-bit addresses environement, +// When RBP is used as a base pointer in a 32-bit addresses environment, // this is also safe to use the full register to access addresses. // Since RBP will never be spilled, stick to a 32 alignment to save // on memory consumption. 
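Earlier in this file, hasBasePointer gains an early exit: a function containing preallocated calls now always reserves a base pointer, and only otherwise does the old rule apply (a base pointer is needed exactly when stack realignment rules out the frame pointer and dynamic stack adjustment rules out the stack pointer). The decision, distilled into a free-standing predicate with booleans standing in for hasPreallocatedCall(), EnableBasePointer, needsStackRealignment() and CantUseSP() (an illustration of the logic only, not the LLVM code):

#include <cassert>

bool needsBasePointer(bool HasPreallocatedCall, bool EnableBasePointer,
                      bool NeedsRealignment, bool CantUseSP) {
  if (HasPreallocatedCall)
    return true;                        // new early exit added by this change
  if (!EnableBasePointer)
    return false;
  return NeedsRealignment && CantUseSP; // can address via neither FP nor SP
}

int main() {
  assert(needsBasePointer(true, false, false, false)); // preallocated call forces it
  assert(!needsBasePointer(false, true, true, false)); // FP unusable but SP still works
  assert(needsBasePointer(false, true, true, true));   // neither FP nor SP usable
  return 0;
}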
@@ -621,3 +631,8 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} // Bound registers def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; + +// Tiles +let isAllocatable = 0 in +def TILE : RegisterClass<"X86", [untyped], 0, + (sequence "TMM%u", 0, 7)> {let Size = 8192;} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td index 9b1fcaa8a13d..4aea7bc253bb 100755 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -260,7 +260,8 @@ defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM). defm : X86WriteResPairUnsupported<WriteFCmp64Z>; -defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags. +defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags (X87). +defm : BWWriteResPair<WriteFComX, [BWPort1], 3>; // Floating point compare to flags (SSE). defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication. defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM). @@ -351,8 +352,10 @@ defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; -defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>; @@ -986,7 +989,7 @@ def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup62], (instrs FARJMP64)>; +def: InstRW<[BWWriteResGroup62], (instrs FARJMP64m)>; def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>; def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> { @@ -1127,7 +1130,7 @@ def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort let ResourceCycles = [1,1,1,1,1]; } def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>; -def: InstRW<[BWWriteResGroup89], (instrs FARCALL64)>; +def: InstRW<[BWWriteResGroup89], (instrs FARCALL64m)>; def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> { let Latency = 7; @@ -1479,54 +1482,42 @@ def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>; def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 22; + let Latency = 17; let NumMicroOps = 7; let ResourceCycles = [1,3,2,1]; } -def: 
InstRW<[BWWriteResGroup183_1], (instrs VGATHERQPDrm)>; +def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERDPDrm, VPGATHERDQrm, + VGATHERQPDrm, VPGATHERQQrm)>; def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 23; + let Latency = 18; let NumMicroOps = 9; let ResourceCycles = [1,3,4,1]; } -def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERQPDYrm)>; +def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERDPDYrm, VPGATHERDQYrm, + VGATHERQPDYrm, VPGATHERQQYrm)>; def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 24; + let Latency = 19; let NumMicroOps = 9; let ResourceCycles = [1,5,2,1]; } -def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSYrm)>; +def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSrm, VPGATHERQDrm)>; def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 25; - let NumMicroOps = 7; - let ResourceCycles = [1,3,2,1]; + let Latency = 19; + let NumMicroOps = 10; + let ResourceCycles = [1,4,4,1]; } -def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPDrm, - VGATHERDPSrm)>; +def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPSrm, VPGATHERDDrm, + VGATHERQPSYrm, VPGATHERQDYrm)>; def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 26; - let NumMicroOps = 9; - let ResourceCycles = [1,5,2,1]; -} -def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>; - -def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 26; + let Latency = 21; let NumMicroOps = 14; let ResourceCycles = [1,4,8,1]; } -def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>; - -def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { - let Latency = 27; - let NumMicroOps = 9; - let ResourceCycles = [1,5,2,1]; -} -def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>; +def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>; def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { let Latency = 29; @@ -1604,7 +1595,7 @@ def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>; def: InstRW<[WriteZero], (instrs CLC)>; -// Intruction variants handled by the renamer. These might not need execution +// Instruction variants handled by the renamer. These might not need execution // ports in certain conditions. 
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", // section "Haswell and Broadwell Pipeline" > "Register allocation and diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td index 06f417501b21..746dbaeca189 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td @@ -261,6 +261,7 @@ defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>; defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1 defm : HWWriteResPair<WriteFCom, [HWPort1], 3>; +defm : HWWriteResPair<WriteFComX, [HWPort1], 3>; defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>; defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>; @@ -391,8 +392,10 @@ defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; -defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>; @@ -996,7 +999,7 @@ def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup14], (instrs FARJMP64)>; +def: InstRW<[HWWriteResGroup14], (instrs FARJMP64m)>; def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>; def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> { @@ -1205,7 +1208,7 @@ def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort let ResourceCycles = [1,1,1,1,1]; } def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>; -def: InstRW<[HWWriteResGroup48], (instrs FARCALL64)>; +def: InstRW<[HWWriteResGroup48], (instrs FARCALL64m)>; def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { let Latency = 3; @@ -1784,80 +1787,60 @@ def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6, } def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>; -def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> { - let Latency = 26; +def HWWriteResGroup184 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 14; let NumMicroOps = 12; - let ResourceCycles = [2,2,1,3,2,2]; -} -def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm, - VPGATHERDQrm, - VPGATHERDDrm)>; - -def HWWriteResGroup185 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 24; - let NumMicroOps = 22; - let ResourceCycles = [5,3,4,1,5,4]; + let ResourceCycles = [2,2,2,1,3,2]; } -def: InstRW<[HWWriteResGroup185], (instrs VGATHERQPDYrm, - VPGATHERQQYrm)>; +def: InstRW<[HWWriteResGroup184], (instrs 
VGATHERDPDrm, VPGATHERDQrm)>; -def HWWriteResGroup186 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 28; - let NumMicroOps = 22; - let ResourceCycles = [5,3,4,1,5,4]; -} -def: InstRW<[HWWriteResGroup186], (instrs VPGATHERQDYrm)>; - -def HWWriteResGroup187 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 25; - let NumMicroOps = 22; - let ResourceCycles = [5,3,4,1,5,4]; +def HWWriteResGroup185 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 17; + let NumMicroOps = 20; + let ResourceCycles = [3,3,4,1,5,4]; } -def: InstRW<[HWWriteResGroup187], (instrs VPGATHERQDrm)>; +def: InstRW<[HWWriteResGroup185], (instrs VGATHERDPDYrm, VPGATHERDQYrm)>; -def HWWriteResGroup188 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 27; +def HWWriteResGroup186 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 16; let NumMicroOps = 20; let ResourceCycles = [3,3,4,1,5,4]; } -def: InstRW<[HWWriteResGroup188], (instrs VGATHERDPDYrm, - VPGATHERDQYrm)>; +def: InstRW<[HWWriteResGroup186], (instrs VGATHERDPSrm, VPGATHERDDrm)>; -def HWWriteResGroup189 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 27; +def HWWriteResGroup187 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 22; let NumMicroOps = 34; let ResourceCycles = [5,3,8,1,9,8]; } -def: InstRW<[HWWriteResGroup189], (instrs VGATHERDPSYrm, - VPGATHERDDYrm)>; +def: InstRW<[HWWriteResGroup187], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>; -def HWWriteResGroup190 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 23; +def HWWriteResGroup188 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 15; let NumMicroOps = 14; let ResourceCycles = [3,3,2,1,3,2]; } -def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPDrm, - VPGATHERQQrm)>; +def: InstRW<[HWWriteResGroup188], (instrs VGATHERQPDrm, VPGATHERQQrm)>; -def HWWriteResGroup191 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 28; - let NumMicroOps = 15; - let ResourceCycles = [3,3,2,1,4,2]; +def HWWriteResGroup189 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 17; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; } -def: InstRW<[HWWriteResGroup191], (instrs VGATHERQPSYrm)>; +def: InstRW<[HWWriteResGroup189], (instrs VGATHERQPDYrm, VPGATHERQQYrm, + VGATHERQPSYrm, VPGATHERQDYrm)>; -def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { - let Latency = 25; +def HWWriteResGroup190 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> { + let Latency = 16; let NumMicroOps = 15; let ResourceCycles = [3,3,2,1,4,2]; } -def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm, - VGATHERDPSrm)>; +def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPSrm, VPGATHERQDrm)>; def: InstRW<[WriteZero], (instrs CLC)>; -// Intruction variants handled by the renamer. These might not need execution +// Instruction variants handled by the renamer. These might not need execution // ports in certain conditions. 
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", // section "Haswell and Broadwell Pipeline" > "Register allocation and diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td index 26d4d8fa3549..ac32f1b19990 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -238,6 +238,7 @@ defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>; defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 defm : SBWriteResPair<WriteFCom, [SBPort1], 3>; +defm : SBWriteResPair<WriteFComX, [SBPort1], 3>; defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>; defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>; @@ -366,8 +367,10 @@ defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>; -defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; -defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteVecMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteVecMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>; @@ -481,7 +484,7 @@ def : WriteRes<WritePCmpEStrM, [SBPort015]> { let ResourceCycles = [8]; } def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> { - let Latency = 11; + let Latency = 17; let ResourceCycles = [7, 1]; } @@ -503,7 +506,7 @@ def : WriteRes<WritePCmpEStrI, [SBPort015]> { let ResourceCycles = [8]; } def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> { - let Latency = 4; + let Latency = 10; let ResourceCycles = [7, 1]; } @@ -541,7 +544,7 @@ def : WriteRes<WriteAESKeyGen, [SBPort015]> { let ResourceCycles = [11]; } def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> { - let Latency = 8; + let Latency = 14; let ResourceCycles = [10, 1]; } @@ -551,7 +554,7 @@ def : WriteRes<WriteCLMul, [SBPort015]> { let ResourceCycles = [18]; } def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> { - let Latency = 14; + let Latency = 20; let ResourceCycles = [17, 1]; } @@ -881,7 +884,7 @@ def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[SBWriteResGroup64], (instrs FARJMP64)>; +def: InstRW<[SBWriteResGroup64], (instrs FARJMP64m)>; def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> { let Latency = 7; @@ -967,7 +970,7 @@ def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { let NumMicroOps = 5; let ResourceCycles = [1,1,1,2]; } -def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>; +def: InstRW<[SBWriteResGroup87], (instrs FARCALL64m)>; def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { let Latency = 9; @@ -1105,7 +1108,7 @@ def: InstRW<[SBWriteResGroupVzeroupper], (instrs VZEROUPPER)>; def: InstRW<[WriteZero], (instrs CLC)>; -// 
Intruction variants handled by the renamer. These might not need execution +// Instruction variants handled by the renamer. These might not need execution // ports in certain conditions. // See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", // section "Sandy Bridge and Ivy Bridge Pipeline" > "Register allocation and diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 9a511ecc0071..0599564765da 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -255,7 +255,8 @@ defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>; defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; -defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags. +defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags (X87). +defm : SKLWriteResPair<WriteFComX, [SKLPort0], 2>; // Floating point compare to flags (SSE). defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication. defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>; @@ -342,8 +343,10 @@ defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; -defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>; @@ -361,9 +364,9 @@ defm : X86WriteResPairUnsupported<WriteVecLogicZ>; defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions. defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>; defm : X86WriteResPairUnsupported<WriteVecTestZ>; -defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 4, [1], 1, 5>; // Vector integer multiply. -defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 4, [1], 1, 6>; -defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>; +defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 5, [1], 1, 5>; // Vector integer multiply. +defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 5, [1], 1, 6>; +defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 5, [1], 1, 7>; defm : X86WriteResPairUnsupported<WriteVecIMulZ>; defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD. 
defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>; @@ -1012,7 +1015,7 @@ def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64)>; +def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64m)>; def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>; def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> { @@ -1193,7 +1196,7 @@ def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237, let ResourceCycles = [1,1,1,1,1]; } def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>; -def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64)>; +def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64m)>; def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> { let Latency = 7; @@ -1592,33 +1595,31 @@ def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> { } def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>; -def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { - let Latency = 22; - let NumMicroOps = 5; +def SKLWriteResGroupVEX2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 18; + let NumMicroOps = 5; // 2 uops perform multiple loads let ResourceCycles = [1,2,1,1]; } -def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm, - VGATHERDPDrm, - VGATHERQPDrm, - VGATHERQPSrm, - VPGATHERDDrm, - VPGATHERDQrm, - VPGATHERQDrm, - VPGATHERQQrm)>; +def: InstRW<[SKLWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm, + VGATHERQPDrm, VPGATHERQQrm, + VGATHERQPSrm, VPGATHERQDrm)>; -def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { - let Latency = 25; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; +def SKLWriteResGroupVEX4 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 20; + let NumMicroOps = 5; // 2 uops peform multiple loads + let ResourceCycles = [1,4,1,1]; +} +def: InstRW<[SKLWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm, + VGATHERDPSrm, VPGATHERDDrm, + VGATHERQPDYrm, VPGATHERQQYrm, + VGATHERQPSYrm, VPGATHERQDYrm)>; + +def SKLWriteResGroupVEX8 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 22; + let NumMicroOps = 5; // 2 uops perform multiple loads + let ResourceCycles = [1,8,1,1]; } -def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm, - VGATHERQPDYrm, - VGATHERQPSYrm, - VPGATHERDDYrm, - VPGATHERDQYrm, - VPGATHERQDYrm, - VPGATHERQQYrm, - VGATHERDPDYrm)>; +def: InstRW<[SKLWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>; def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { let Latency = 23; @@ -1745,7 +1746,7 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>; def: InstRW<[WriteZero], (instrs CLC)>; -// Intruction variants handled by the renamer. These might not need execution +// Instruction variants handled by the renamer. These might not need execution // ports in certain conditions. // See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", // section "Skylake Pipeline" > "Register allocation and renaming". 
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td index a8c65435ab9b..7fc96d1eda89 100755 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -255,7 +255,8 @@ defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>; defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>; defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>; -defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags. +defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags (X87). +defm : SKXWriteResPair<WriteFComX, [SKXPort0], 2>; // Floating point compare to flags (SSE). defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication. defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>; @@ -342,8 +343,10 @@ defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>; -defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; -defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>; @@ -361,10 +364,10 @@ defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>; defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions. defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>; defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>; -defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 4, [1], 1, 5>; // Vector integer multiply. -defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 4, [1], 1, 6>; -defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 4, [1], 1, 7>; -defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5, [1], 1, 5>; // Vector integer multiply. +defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 5, [1], 1, 6>; +defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 5, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 5, [1], 1, 7>; defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD. 
defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>; defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>; @@ -619,6 +622,8 @@ def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr", "KOR(B|D|Q|W)rr", "KXNOR(B|D|Q|W)rr", "KXOR(B|D|Q|W)rr", + "KSET0(B|D|Q|W)", // Same as KXOR + "KSET1(B|D|Q|W)", // Same as KXNOR "MMX_PADDS(B|W)irr", "MMX_PADDUS(B|W)irr", "MMX_PAVG(B|W)irr", @@ -814,19 +819,26 @@ def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> { } def: InstRW<[SKXWriteResGroup32], (instrs VPSADBWZrr)>; // TODO: 512-bit ops require ports 0/1 to be joined. def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)", - "KADD(B|D|Q|W)rr", + "VALIGND(Z|Z128|Z256)rri", + "VALIGNQ(Z|Z128|Z256)rri", + "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined. + "VPBROADCAST(B|W)rr", + "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr")>; + +def SKXWriteResGroup33 : SchedWriteRes<[SKXPort5]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup33], (instregex "KADD(B|D|Q|W)rr", "KSHIFTL(B|D|Q|W)ri", "KSHIFTR(B|D|Q|W)ri", "KUNPCK(BW|DQ|WD)rr", - "VALIGND(Z|Z128|Z256)rri", - "VALIGNQ(Z|Z128|Z256)rri", "VCMPPD(Z|Z128|Z256)rri", "VCMPPS(Z|Z128|Z256)rri", "VCMP(SD|SS)Zrr", - "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined. "VFPCLASS(PD|PS)(Z|Z128|Z256)rr", "VFPCLASS(SD|SS)Zrr", - "VPBROADCAST(B|W)rr", "VPCMPB(Z|Z128|Z256)rri", "VPCMPD(Z|Z128|Z256)rri", "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr", @@ -834,7 +846,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0 "VPCMPQ(Z|Z128|Z256)rri", "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri", "VPCMPW(Z|Z128|Z256)rri", - "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr", "VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>; def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> { @@ -1171,7 +1182,7 @@ def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64)>; +def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64m)>; def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>; def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> { @@ -1331,8 +1342,8 @@ def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm, def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd], (instregex "VBLENDMPDZ128rm(b?)", "VBLENDMPSZ128rm(b?)", - "VBROADCASTI32X2Z128m(b?)", - "VBROADCASTSSZ128m(b?)", + "VBROADCASTI32X2Z128rm(b?)", + "VBROADCASTSSZ128rm(b?)", "VINSERT(F|I)128rm", "VMOVAPDZ128rm(b?)", "VMOVAPSZ128rm(b?)", @@ -1350,8 +1361,8 @@ def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd], "VPADD(B|D|Q|W)Z128rm(b?)", "(V?)PADD(B|D|Q|W)rm", "VPBLENDM(B|D|Q|W)Z128rm(b?)", - "VPBROADCASTDZ128m(b?)", - "VPBROADCASTQZ128m(b?)", + "VPBROADCASTDZ128rm(b?)", + "VPBROADCASTQZ128rm(b?)", "VPSUB(B|D|Q|W)Z128rm(b?)", "(V?)PSUB(B|D|Q|W)rm", "VPTERNLOGDZ128rm(b?)i", @@ -1456,7 +1467,7 @@ def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237, let ResourceCycles = [1,1,1,1,1]; } def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>; -def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64)>; +def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64m)>; def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> { let Latency = 7; @@ -1516,9 +1527,8 @@ def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> { let ResourceCycles = [1,1]; } def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m", - 
"VFPCLASSSDZrm(b?)", - "VPBROADCASTB(Z|Z256)m(b?)", - "VPBROADCASTW(Z|Z256)m(b?)")>; + "VPBROADCASTB(Z|Z256)rm(b?)", + "VPBROADCASTW(Z|Z256)rm(b?)")>; def: InstRW<[SKXWriteResGroup119], (instrs VPBROADCASTBYrm, VPBROADCASTWYrm, VPMOVSXBDYrm, @@ -1535,24 +1545,24 @@ def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm, def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd], (instregex "VBLENDMPD(Z|Z256)rm(b?)", "VBLENDMPS(Z|Z256)rm(b?)", - "VBROADCASTF32X2Z256m(b?)", - "VBROADCASTF32X2Zm(b?)", + "VBROADCASTF32X2Z256rm(b?)", + "VBROADCASTF32X2Zrm(b?)", "VBROADCASTF32X4Z256rm(b?)", "VBROADCASTF32X4rm(b?)", "VBROADCASTF32X8rm(b?)", "VBROADCASTF64X2Z128rm(b?)", "VBROADCASTF64X2rm(b?)", "VBROADCASTF64X4rm(b?)", - "VBROADCASTI32X2Z256m(b?)", - "VBROADCASTI32X2Zm(b?)", + "VBROADCASTI32X2Z256rm(b?)", + "VBROADCASTI32X2Zrm(b?)", "VBROADCASTI32X4Z256rm(b?)", "VBROADCASTI32X4rm(b?)", "VBROADCASTI32X8rm(b?)", "VBROADCASTI64X2Z128rm(b?)", "VBROADCASTI64X2rm(b?)", "VBROADCASTI64X4rm(b?)", - "VBROADCASTSD(Z|Z256)m(b?)", - "VBROADCASTSS(Z|Z256)m(b?)", + "VBROADCASTSD(Z|Z256)rm(b?)", + "VBROADCASTSS(Z|Z256)rm(b?)", "VINSERTF32x4(Z|Z256)rm(b?)", "VINSERTF32x8Zrm(b?)", "VINSERTF64x2(Z|Z256)rm(b?)", @@ -1577,8 +1587,8 @@ def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd], "VPADD(B|D|Q|W)Yrm", "VPADD(B|D|Q|W)(Z|Z256)rm(b?)", "VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)", - "VPBROADCASTD(Z|Z256)m(b?)", - "VPBROADCASTQ(Z|Z256)m(b?)", + "VPBROADCASTD(Z|Z256)rm(b?)", + "VPBROADCASTQ(Z|Z256)rm(b?)", "VPSUB(B|D|Q|W)Yrm", "VPSUB(B|D|Q|W)(Z|Z256)rm(b?)", "VPTERNLOGD(Z|Z256)rm(b?)i", @@ -1667,17 +1677,9 @@ def: InstRW<[SKXWriteResGroup136], (instrs VPMOVSXBWYrm, VPMOVSXWDYrm, VPMOVZXWDYrm)>; def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i", - "VCMP(PD|PS)Z128rm(b?)i", - "VCMP(SD|SS)Zrm", + "VFPCLASSSDZrm(b?)", "VFPCLASSSSZrm(b?)", - "VPCMPBZ128rmi(b?)", - "VPCMPDZ128rmi(b?)", - "VPCMPEQ(B|D|Q|W)Z128rm(b?)", - "VPCMPGT(B|D|Q|W)Z128rm(b?)", "(V?)PCMPGTQrm", - "VPCMPQZ128rmi(b?)", - "VPCMPU(B|D|Q|W)Z128rmi(b?)", - "VPCMPWZ128rmi(b?)", "VPERMI2D128rm(b?)", "VPERMI2PD128rm(b?)", "VPERMI2PS128rm(b?)", @@ -1701,15 +1703,32 @@ def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i", "VPMOVZXBWZ128rm(b?)", "VPMOVZXDQZ128rm(b?)", "VPMOVZXWDZ128rm(b?)", - "VPMOVZXWQZ128rm(b?)", - "VPTESTMBZ128rm(b?)", - "VPTESTMDZ128rm(b?)", - "VPTESTMQZ128rm(b?)", - "VPTESTMWZ128rm(b?)", - "VPTESTNMBZ128rm(b?)", - "VPTESTNMDZ128rm(b?)", - "VPTESTNMQZ128rm(b?)", - "VPTESTNMWZ128rm(b?)")>; + "VPMOVZXWQZ128rm(b?)")>; + +def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i", + "VCMP(SD|SS)Zrm", + "VFPCLASSPDZ128rm(b?)", + "VFPCLASSPSZ128rm(b?)", + "VPCMPBZ128rmi(b?)", + "VPCMPDZ128rmi(b?)", + "VPCMPEQ(B|D|Q|W)Z128rm(b?)", + "VPCMPGT(B|D|Q|W)Z128rm(b?)", + "VPCMPQZ128rmi(b?)", + "VPCMPU(B|D|Q|W)Z128rmi(b?)", + "VPCMPWZ128rmi(b?)", + "VPTESTMBZ128rm(b?)", + "VPTESTMDZ128rm(b?)", + "VPTESTMQZ128rm(b?)", + "VPTESTMWZ128rm(b?)", + "VPTESTNMBZ128rm(b?)", + "VPTESTNMDZ128rm(b?)", + "VPTESTNMQZ128rm(b?)", + "VPTESTNMWZ128rm(b?)")>; def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 9; @@ -1745,30 +1764,38 @@ def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m", "ILD_F(16|32|64)m", "VALIGND(Z|Z256)rm(b?)i", "VALIGNQ(Z|Z256)rm(b?)i", - "VCMPPD(Z|Z256)rm(b?)i", - "VCMPPS(Z|Z256)rm(b?)i", - "VPCMPB(Z|Z256)rmi(b?)", - 
"VPCMPD(Z|Z256)rmi(b?)", - "VPCMPEQB(Z|Z256)rm(b?)", - "VPCMPEQD(Z|Z256)rm(b?)", - "VPCMPEQQ(Z|Z256)rm(b?)", - "VPCMPEQW(Z|Z256)rm(b?)", - "VPCMPGTB(Z|Z256)rm(b?)", - "VPCMPGTD(Z|Z256)rm(b?)", - "VPCMPGTQ(Z|Z256)rm(b?)", - "VPCMPGTW(Z|Z256)rm(b?)", - "VPCMPQ(Z|Z256)rmi(b?)", - "VPCMPU(B|D|Q|W)Z256rmi(b?)", - "VPCMPU(B|D|Q|W)Zrmi(b?)", - "VPCMPW(Z|Z256)rmi(b?)", "VPMAXSQ(Z|Z256)rm(b?)", "VPMAXUQ(Z|Z256)rm(b?)", "VPMINSQ(Z|Z256)rm(b?)", - "VPMINUQ(Z|Z256)rm(b?)", - "VPTESTM(B|D|Q|W)Z256rm(b?)", - "VPTESTM(B|D|Q|W)Zrm(b?)", - "VPTESTNM(B|D|Q|W)Z256rm(b?)", - "VPTESTNM(B|D|Q|W)Zrm(b?)")>; + "VPMINUQ(Z|Z256)rm(b?)")>; + +def SKXWriteResGroup148_2 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i", + "VCMPPS(Z|Z256)rm(b?)i", + "VFPCLASSPD(Z|Z256)rm(b?)", + "VFPCLASSPS(Z|Z256)rm(b?)", + "VPCMPB(Z|Z256)rmi(b?)", + "VPCMPD(Z|Z256)rmi(b?)", + "VPCMPEQB(Z|Z256)rm(b?)", + "VPCMPEQD(Z|Z256)rm(b?)", + "VPCMPEQQ(Z|Z256)rm(b?)", + "VPCMPEQW(Z|Z256)rm(b?)", + "VPCMPGTB(Z|Z256)rm(b?)", + "VPCMPGTD(Z|Z256)rm(b?)", + "VPCMPGTQ(Z|Z256)rm(b?)", + "VPCMPGTW(Z|Z256)rm(b?)", + "VPCMPQ(Z|Z256)rmi(b?)", + "VPCMPU(B|D|Q|W)Z256rmi(b?)", + "VPCMPU(B|D|Q|W)Zrmi(b?)", + "VPCMPW(Z|Z256)rmi(b?)", + "VPTESTM(B|D|Q|W)Z256rm(b?)", + "VPTESTM(B|D|Q|W)Zrm(b?)", + "VPTESTNM(B|D|Q|W)Z256rm(b?)", + "VPTESTNM(B|D|Q|W)Zrm(b?)")>; def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> { let Latency = 10; @@ -1938,14 +1965,14 @@ def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> { def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>; def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> { - let Latency = 12; + let Latency = 15; let NumMicroOps = 3; let ResourceCycles = [3]; } def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>; def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> { - let Latency = 12; + let Latency = 15; let NumMicroOps = 3; let ResourceCycles = [3]; } @@ -2106,8 +2133,8 @@ def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKX } def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>; -def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> { - let Latency = 18; +def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort01]> { + let Latency = 21; let NumMicroOps = 4; let ResourceCycles = [1,3]; } @@ -2134,21 +2161,19 @@ def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> { } def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair -def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> { - let Latency = 19; +def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort01]> { + let Latency = 22; let NumMicroOps = 4; let ResourceCycles = [1,3]; } -def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)", - "VPMULLQZrm(b?)")>; +def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)")>; -def SKXWriteResGroup214 : SchedWriteRes<[]> { - let Latency = 20; - let NumMicroOps = 0; +def SKXWriteResGroup211_1 : SchedWriteRes<[SKXPort23,SKXPort05]> { + let Latency = 22; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; } -def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm, - VGATHERQPSZrm, - VPGATHERDDZ128rm)>; +def: InstRW<[SKXWriteResGroup211_1], (instregex "VPMULLQZrm(b?)")>; def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> { let Latency = 20; @@ -2164,15 +2189,41 @@ def SKXWriteResGroup216 : 
SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> { } def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair -def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { - let Latency = 20; - let NumMicroOps = 5; +def SKXWriteGatherEVEX2 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 17; + let NumMicroOps = 5; // 2 uops perform multiple loads let ResourceCycles = [1,2,1,1]; } -def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm, - VGATHERQPSZ256rm, - VPGATHERQDZ128rm, - VPGATHERQDZ256rm)>; +def: InstRW<[SKXWriteGatherEVEX2], (instrs VGATHERQPSZ128rm, VPGATHERQDZ128rm, + VGATHERDPDZ128rm, VPGATHERDQZ128rm, + VGATHERQPDZ128rm, VPGATHERQQZ128rm)>; + +def SKXWriteGatherEVEX4 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 19; + let NumMicroOps = 5; // 2 uops perform multiple loads + let ResourceCycles = [1,4,1,1]; +} +def: InstRW<[SKXWriteGatherEVEX4], (instrs VGATHERQPSZ256rm, VPGATHERQDZ256rm, + VGATHERQPDZ256rm, VPGATHERQQZ256rm, + VGATHERDPSZ128rm, VPGATHERDDZ128rm, + VGATHERDPDZ256rm, VPGATHERDQZ256rm)>; + +def SKXWriteGatherEVEX8 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 21; + let NumMicroOps = 5; // 2 uops perform multiple loads + let ResourceCycles = [1,8,1,1]; +} +def: InstRW<[SKXWriteGatherEVEX8], (instrs VGATHERDPSZ256rm, VPGATHERDDZ256rm, + VGATHERDPDZrm, VPGATHERDQZrm, + VGATHERQPDZrm, VPGATHERQQZrm, + VGATHERQPSZrm, VPGATHERQDZrm)>; + +def SKXWriteGatherEVEX16 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 25; + let NumMicroOps = 5; // 2 uops perform multiple loads + let ResourceCycles = [1,16,1,1]; +} +def: InstRW<[SKXWriteGatherEVEX16], (instrs VGATHERDPSZrm, VPGATHERDDZrm)>; def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { let Latency = 20; @@ -2202,57 +2253,31 @@ def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> { } def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>; -def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { - let Latency = 22; - let NumMicroOps = 5; +def SKXWriteResGroupVEX2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 18; + let NumMicroOps = 5; // 2 uops perform multiple loads let ResourceCycles = [1,2,1,1]; } -def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm, - VGATHERQPDZ128rm, - VPGATHERDQZ128rm, - VPGATHERQQZ128rm)>; +def: InstRW<[SKXWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm, + VGATHERQPDrm, VPGATHERQQrm, + VGATHERQPSrm, VPGATHERQDrm)>; -def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { - let Latency = 22; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; +def SKXWriteResGroupVEX4 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 20; + let NumMicroOps = 5; // 2 uops peform multiple loads + let ResourceCycles = [1,4,1,1]; } -def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm, - VGATHERDPDrm, - VGATHERQPDrm, - VGATHERQPSrm, - VPGATHERDDrm, - VPGATHERDQrm, - VPGATHERQDrm, - VPGATHERQQrm, - VPGATHERDDrm, - VPGATHERQDrm, - VPGATHERDQrm, - VPGATHERQQrm, - VGATHERDPSrm, - VGATHERQPSrm, - VGATHERDPDrm, - VGATHERQPDrm)>; - -def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { - let Latency = 25; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; +def: 
InstRW<[SKXWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm, + VGATHERDPSrm, VPGATHERDDrm, + VGATHERQPDYrm, VPGATHERQQYrm, + VGATHERQPSYrm, VPGATHERQDYrm)>; + +def SKXWriteResGroupVEX8 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 22; + let NumMicroOps = 5; // 2 uops perform multiple loads + let ResourceCycles = [1,8,1,1]; } -def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm, - VGATHERQPDYrm, - VGATHERQPSYrm, - VPGATHERDDYrm, - VPGATHERDQYrm, - VPGATHERQDYrm, - VPGATHERQQYrm, - VPGATHERDDYrm, - VPGATHERQDYrm, - VPGATHERDQYrm, - VPGATHERQQYrm, - VGATHERDPSYrm, - VGATHERQPSYrm, - VGATHERDPDYrm)>; +def: InstRW<[SKXWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>; def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { let Latency = 22; @@ -2276,27 +2301,6 @@ def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { } def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>; -def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { - let Latency = 25; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; -} -def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm, - VGATHERQPDZ256rm, - VPGATHERDQZ256rm, - VPGATHERQDZrm, - VPGATHERQQZ256rm)>; - -def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { - let Latency = 26; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; -} -def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm, - VGATHERQPDZrm, - VPGATHERDQZrm, - VPGATHERQQZrm)>; - def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> { let Latency = 27; let NumMicroOps = 2; @@ -2304,14 +2308,6 @@ def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> { } def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>; -def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { - let Latency = 27; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; -} -def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm, - VPGATHERDDZ256rm)>; - def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { let Latency = 29; let NumMicroOps = 15; @@ -2326,14 +2322,6 @@ def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { } def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>; -def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { - let Latency = 30; - let NumMicroOps = 5; - let ResourceCycles = [1,2,1,1]; -} -def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm, - VPGATHERDDZrm)>; - def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> { let Latency = 35; let NumMicroOps = 23; @@ -2461,7 +2449,7 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>; def: InstRW<[WriteZero], (instrs CLC)>; -// Intruction variants handled by the renamer. These might not need execution +// Instruction variants handled by the renamer. These might not need execution // ports in certain conditions. // See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", // section "Skylake Pipeline" > "Register allocation and renaming". 
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td index 95f710061aeb..f204d6622119 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td @@ -250,7 +250,8 @@ defm WriteFCmp64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double defm WriteFCmp64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double compare (XMM). defm WriteFCmp64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (YMM). defm WriteFCmp64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (ZMM). -defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags. +defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (X87). +defm WriteFComX : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (SSE). defm WriteFMul : X86SchedWritePair<ReadAfterVecLd>; // Floating point multiplication. defm WriteFMulX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point multiplication (XMM). defm WriteFMulY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point multiplication (YMM). @@ -340,8 +341,10 @@ def WriteVecStoreX : SchedWrite; def WriteVecStoreY : SchedWrite; def WriteVecStoreNT : SchedWrite; def WriteVecStoreNTY : SchedWrite; -def WriteVecMaskedStore : SchedWrite; -def WriteVecMaskedStoreY : SchedWrite; +def WriteVecMaskedStore32 : SchedWrite; +def WriteVecMaskedStore64 : SchedWrite; +def WriteVecMaskedStore32Y : SchedWrite; +def WriteVecMaskedStore64Y : SchedWrite; def WriteVecMove : SchedWrite; def WriteVecMoveX : SchedWrite; def WriteVecMoveY : SchedWrite; @@ -549,6 +552,14 @@ def WriteFMaskMove32Y : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>; def WriteFMaskMove64Y : X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>; +def WriteVecMaskMove32 + : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore32>; +def WriteVecMaskMove64 + : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore64>; +def WriteVecMaskMove32Y + : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore32Y>; +def WriteVecMaskMove64Y + : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore64Y>; // Vector width wrappers. 
def SchedWriteFAdd diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td index b0153ca9da36..b90baf6c16b1 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -244,6 +244,7 @@ defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7, defm : X86WriteResPairUnsupported<WriteFCmp64Y>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFComX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>; defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; defm : X86WriteResPairUnsupported<WriteFMulY>; @@ -368,8 +369,10 @@ def : WriteRes<WriteVecStoreX, [AtomPort0]>; defm : X86WriteResUnsupported<WriteVecStoreY>; def : WriteRes<WriteVecStoreNT, [AtomPort0]>; defm : X86WriteResUnsupported<WriteVecStoreNTY>; -def : WriteRes<WriteVecMaskedStore, [AtomPort0]>; -defm : X86WriteResUnsupported<WriteVecMaskedStoreY>; +defm : X86WriteResUnsupported<WriteVecMaskedStore32>; +defm : X86WriteResUnsupported<WriteVecMaskedStore64>; +defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>; +defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>; def : WriteRes<WriteVecMove, [AtomPort0]>; def : WriteRes<WriteVecMoveX, [AtomPort01]>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td index d7aea3cf4e9d..0a201bc74a48 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -545,8 +545,40 @@ def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> { def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>; // This is for simple LEAs with one or two input operands. -// FIXME: SAGU 3-operand LEA -def : WriteRes<WriteLEA, [PdEX01]> { let NumMicroOps = 2; } +def : WriteRes<WriteLEA, [PdEX01]> { let ResourceCycles = [2]; } + +// This write is used for slow LEA instructions. +def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +// On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset), +// or an LEA with a `Scale` value different than 1. +def PdSlowLEAPredicate : MCSchedPredicate< + CheckAny<[ + // A 3-operand LEA (base, index, offset). + IsThreeOperandsLEAFn, + // An LEA with a "Scale" different than 1. + CheckAll<[ + CheckIsImmOperand<2>, + CheckNot<CheckImmOperand<2, 1>> + ]> + ]> +>; + +def PdWriteLEA : SchedWriteVariant<[ + SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>, + SchedVar<NoSchedPred, [WriteLEA]> +]>; + +def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>; + +def PdWriteLEA16r : SchedWriteRes<[PdEX01]> { + let ResourceCycles = [3]; + let NumMicroOps = 2; +} +def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>; // Bit counts. 
defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>; @@ -766,6 +798,7 @@ defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>; +defm : PdWriteResXMMPair<WriteFComX, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>; def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> { let Latency = 6; @@ -1060,8 +1093,10 @@ def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>; defm : PdWriteRes<WriteVecStoreNT, [PdStore, PdFPU1, PdFPSTO], 2>; defm : PdWriteRes<WriteVecStoreNTY, [PdStore, PdFPU1, PdFPSTO], 2, [2, 2, 2], 4>; -defm : PdWriteRes<WriteVecMaskedStore, [PdStore, PdFPU01, PdFPMAL], 6, [1, 1, 4]>; -defm : PdWriteRes<WriteVecMaskedStoreY, [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>; +defm : X86WriteResUnsupported<WriteVecMaskedStore32>; +defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>; +defm : X86WriteResUnsupported<WriteVecMaskedStore64>; +defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>; defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>; defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td index d0421d94ee05..13b6eed5126d 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -541,6 +541,7 @@ defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>; defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>; +defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0], 3>; defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>; defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>; defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>; @@ -669,8 +670,10 @@ defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>; defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>; -defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>; -defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>; +defm : X86WriteResUnsupported<WriteVecMaskedStore32>; +defm : X86WriteResUnsupported<WriteVecMaskedStore64>; +defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>; +defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>; defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>; defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td index dcd155ea0e0e..3d53ef104ed6 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -214,6 +214,7 @@ defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>; defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFComX, [SLM_FPC_RSV1], 3>; defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, 
[1,2]>; defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; @@ -310,8 +311,10 @@ def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>; def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>; def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>; def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>; -def : WriteRes<WriteVecMaskedStore, [SLM_MEC_RSV]>; -def : WriteRes<WriteVecMaskedStoreY, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMaskedStore32, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMaskedStore32Y, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMaskedStore64, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMaskedStore64Y, [SLM_MEC_RSV]>; def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>; def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>; def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>; @@ -390,44 +393,15 @@ defm : X86WriteResPairUnsupported<WritePHAddZ>; // String instructions. // Packed Compare Implicit Length Strings, Return Mask -def : WriteRes<WritePCmpIStrM, [SLM_FPC_RSV0]> { - let Latency = 13; - let ResourceCycles = [13]; -} -def : WriteRes<WritePCmpIStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 13; - let ResourceCycles = [13, 1]; -} +defm : SLMWriteResPair<WritePCmpIStrM, [SLM_FPC_RSV0], 13, [13]>; // Packed Compare Explicit Length Strings, Return Mask -def : WriteRes<WritePCmpEStrM, [SLM_FPC_RSV0]> { - let Latency = 17; - let ResourceCycles = [17]; -} -def : WriteRes<WritePCmpEStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 17; - let ResourceCycles = [17, 1]; -} - +defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17]>; // Packed Compare Implicit Length Strings, Return Index -def : WriteRes<WritePCmpIStrI, [SLM_FPC_RSV0]> { - let Latency = 17; - let ResourceCycles = [17]; -} -def : WriteRes<WritePCmpIStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 17; - let ResourceCycles = [17, 1]; -} +defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17]>; // Packed Compare Explicit Length Strings, Return Index -def : WriteRes<WritePCmpEStrI, [SLM_FPC_RSV0]> { - let Latency = 21; - let ResourceCycles = [21]; -} -def : WriteRes<WritePCmpEStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 21; - let ResourceCycles = [21, 1]; -} +defm : SLMWriteResPair<WritePCmpEStrI, [SLM_FPC_RSV0], 21, [21]>; // MOVMSK Instructions. def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; } @@ -436,42 +410,12 @@ def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; } def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; } // AES Instructions. -def : WriteRes<WriteAESDecEnc, [SLM_FPC_RSV0]> { - let Latency = 8; - let ResourceCycles = [5]; -} -def : WriteRes<WriteAESDecEncLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 8; - let ResourceCycles = [5, 1]; -} - -def : WriteRes<WriteAESIMC, [SLM_FPC_RSV0]> { - let Latency = 8; - let ResourceCycles = [5]; -} -def : WriteRes<WriteAESIMCLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 8; - let ResourceCycles = [5, 1]; -} - -def : WriteRes<WriteAESKeyGen, [SLM_FPC_RSV0]> { - let Latency = 8; - let ResourceCycles = [5]; -} -def : WriteRes<WriteAESKeyGenLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 8; - let ResourceCycles = [5, 1]; -} +defm : SLMWriteResPair<WriteAESDecEnc, [SLM_FPC_RSV0], 8, [5]>; +defm : SLMWriteResPair<WriteAESIMC, [SLM_FPC_RSV0], 8, [5]>; +defm : SLMWriteResPair<WriteAESKeyGen, [SLM_FPC_RSV0], 8, [5]>; // Carry-less multiplication instructions. 
-def : WriteRes<WriteCLMul, [SLM_FPC_RSV0]> { - let Latency = 10; - let ResourceCycles = [10]; -} -def : WriteRes<WriteCLMulLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { - let Latency = 10; - let ResourceCycles = [10, 1]; -} +defm : SLMWriteResPair<WriteCLMul, [SLM_FPC_RSV0], 10, [10]>; def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; } def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td index 06201f4a3a84..fe09d6f85221 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -295,6 +295,7 @@ defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>; defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFComX, [ZnFPU0], 3>; defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>; defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>; defm : X86WriteResPairUnsupported<WriteFBlendZ>; @@ -387,8 +388,10 @@ defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>; -defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; -defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteVecMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteVecMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td index 4537d9cc7956..48da0d6329b1 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -187,7 +187,7 @@ defm : Zn2WriteResPair<WriteIMul8, [Zn2ALU1, Zn2Multiplier], 4>; defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>; defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>; -defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 1, [1], 1>; +defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 3, [1], 1>; defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>; defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>; @@ -216,7 +216,7 @@ defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>; // Bit counts. 
defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>; -defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 3>; +defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4>; defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>; defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>; defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>; @@ -272,15 +272,16 @@ defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>; defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>; defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>; defm : X86WriteResPairUnsupported<WriteFAdd64Z>; -defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 3>; -defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 3>; -defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 1>; +defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 1>; +defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 1>; defm : X86WriteResPairUnsupported<WriteFCmpZ>; -defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 3>; -defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 3>; -defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 1>; +defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 1>; +defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 1>; defm : X86WriteResPairUnsupported<WriteFCmp64Z>; defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>; +defm : Zn2WriteResFpuPair<WriteFComX, [Zn2FPU0], 3>; defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>; defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>; defm : X86WriteResPairUnsupported<WriteFBlendZ>; @@ -313,8 +314,8 @@ defm : Zn2WriteResFpuPair<WriteFDiv64, [Zn2FPU3], 15>; defm : Zn2WriteResFpuPair<WriteFDiv64X, [Zn2FPU3], 15>; defm : X86WriteResPairUnsupported<WriteFDiv64Z>; defm : Zn2WriteResFpuPair<WriteFSign, [Zn2FPU3], 2>; -defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 4, [1], 1, 7, 0>; -defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 4, [1], 1, 7, 0>; +defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 3, [1], 1, 7, 0>; +defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 3, [1], 1, 7, 0>; defm : X86WriteResPairUnsupported<WriteFRndZ>; defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>; defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>; @@ -325,16 +326,16 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>; defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>; defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>; defm : X86WriteResPairUnsupported<WriteFShuffleZ>; -defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 1>; -defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 1>; +defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 3>; +defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 3>; defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; defm : Zn2WriteResFpuPair<WriteFMul, [Zn2FPU01], 3, [1], 1, 7, 1>; defm : Zn2WriteResFpuPair<WriteFMulX, [Zn2FPU01], 3, [1], 1, 7, 1>; -defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 4, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 3, [1], 1, 7, 1>; defm : X86WriteResPairUnsupported<WriteFMulZ>; defm : Zn2WriteResFpuPair<WriteFMul64, [Zn2FPU01], 3, [1], 1, 7, 1>; defm : Zn2WriteResFpuPair<WriteFMul64X, [Zn2FPU01], 3, [1], 1, 7, 1>; -defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 4, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 3, [1], 1, 7, 1>; defm : X86WriteResPairUnsupported<WriteFMul64Z>; defm : Zn2WriteResFpuPair<WriteFMA, [Zn2FPU03], 5>; defm : Zn2WriteResFpuPair<WriteFMAX, [Zn2FPU03], 5>; @@ -369,8 +370,10 @@ defm : X86WriteRes<WriteVecStoreX, 
[Zn2AGU], 1, [1], 1>; defm : X86WriteRes<WriteVecStoreY, [Zn2AGU], 1, [1], 1>; defm : X86WriteRes<WriteVecStoreNT, [Zn2AGU], 1, [1], 1>; defm : X86WriteRes<WriteVecStoreNTY, [Zn2AGU], 1, [1], 1>; -defm : X86WriteRes<WriteVecMaskedStore, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>; -defm : X86WriteRes<WriteVecMaskedStoreY, [Zn2AGU,Zn2FPU01], 5, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteVecMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>; defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>; @@ -380,7 +383,7 @@ defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>; defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>; defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>; -defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 2>; +defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 1>; defm : X86WriteResPairUnsupported<WriteVecShiftZ>; defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>; defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>; @@ -402,7 +405,7 @@ defm : Zn2WriteResFpuPair<WriteVecIMulX, [Zn2FPU0], 4>; defm : Zn2WriteResFpuPair<WriteVecIMulY, [Zn2FPU0], 4>; defm : X86WriteResPairUnsupported<WriteVecIMulZ>; defm : Zn2WriteResFpuPair<WritePMULLD, [Zn2FPU0], 4, [1], 1, 7, 1>; -defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 3, [1], 1, 7, 1>; +defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 4, [1], 1, 7, 1>; defm : X86WriteResPairUnsupported<WritePMULLDZ>; defm : Zn2WriteResFpuPair<WriteShuffle, [Zn2FPU], 1>; defm : Zn2WriteResFpuPair<WriteShuffleX, [Zn2FPU], 1>; @@ -424,8 +427,8 @@ defm : X86WriteResPairUnsupported<WritePSADBWZ>; defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>; // Vector Shift Operations -defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 1>; -defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 1>; +defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 3>; +defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 3>; defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; // Vector insert/extract operations. @@ -469,6 +472,12 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>; def Zn2WriteMicrocoded : SchedWriteRes<[]> { let Latency = 100; } +defm : Zn2WriteResPair<WriteDPPS, [], 15>; +defm : Zn2WriteResPair<WriteFHAdd, [], 7>; +defm : Zn2WriteResPair<WriteFHAddY, [], 7>; +defm : Zn2WriteResPair<WritePHAdd, [], 3>; +defm : Zn2WriteResPair<WritePHAddX, [], 3>; +defm : Zn2WriteResPair<WritePHAddY, [], 3>; def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>; def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>; @@ -517,14 +526,14 @@ def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> { let NumMicroOps = 2; } -def : InstRW<[Zn2WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; +def : InstRW<[Zn2WriteXCHG], (instregex "^XCHG(8|16|32|64)rr", "^XCHG(16|32|64)ar")>; // r,m. 
def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> { let Latency = 5; let NumMicroOps = 2; } -def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>; +def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "^XCHG(8|16|32|64)rm")>; def : InstRW<[WriteMicrocoded], (instrs XLAT)>; @@ -594,8 +603,11 @@ def : InstRW<[WriteALULd], def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { let Latency = 3; } +def Zn2WriteMul16Imm : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> { + let Latency = 4; +} def : SchedAlias<WriteIMul16, Zn2WriteMul16>; -def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16>; +def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16Imm>; def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>; // m16. @@ -1001,6 +1013,7 @@ def : InstRW<[WriteMicrocoded], (instrs FNINIT)>; // mm <- mm. def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ; def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> { + let Latency = 4; let NumMicroOps = 2; } def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ; @@ -1109,15 +1122,6 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; //-- Arithmetic instructions --// -// HADD, HSUB PS/PD -// PHADD|PHSUB (S) W/D. -def : SchedAlias<WritePHAdd, Zn2WriteMicrocoded>; -def : SchedAlias<WritePHAddLd, Zn2WriteMicrocoded>; -def : SchedAlias<WritePHAddX, Zn2WriteMicrocoded>; -def : SchedAlias<WritePHAddXLd, Zn2WriteMicrocoded>; -def : SchedAlias<WritePHAddY, Zn2WriteMicrocoded>; -def : SchedAlias<WritePHAddYLd, Zn2WriteMicrocoded>; - // PCMPGTQ. def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>; def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; @@ -1137,8 +1141,12 @@ def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>; // PSLL,PSRL,PSRA W/D/Q. // x,x / v,v,x. -def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> ; -def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> ; +def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> { + let Latency = 3; +} +def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> { + let Latency = 3; +} // PSLL,PSRL DQ. def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>; @@ -1280,7 +1288,7 @@ def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> { } // CVTDQ2PD. // x,x. -def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>; +def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2P(D|S)rr")>; // Same as xmm // y,x. @@ -1290,9 +1298,9 @@ def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>; def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> { let Latency = 3; } -// CVT(T)PD2DQ. +// CVT(T)P(D|S)2DQ. // x,x. -def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>; +def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)P(D|S)2DQrr")>; def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> { let Latency = 10; @@ -1322,7 +1330,7 @@ def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> { - let Latency = 4; + let Latency = 3; } // same as CVTPD2DQr @@ -1334,7 +1342,7 @@ def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>; def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>; def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> { - let Latency = 4; + let Latency = 3; } // CVTSI2SD. // x,r32/64. 
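Each defm/Zn2WriteRes* line in the hunks above is a TableGen record tying a scheduling class to the Znver2 functional units it occupies and to its result latency; most of the changes in this file simply retune those latencies. As a rough illustration only (this is not how LLVM represents the model internally), a few of the retuned rows rendered as a plain C++ table:

#include <string_view>

// Illustrative sketch: scheduling-class name, pipes consumed, and latency in
// cycles, with the values taken from the retuned entries in this diff.
struct Zn2SchedRow {
  std::string_view WriteClass;
  std::string_view Pipes;
  unsigned Latency;
};

constexpr Zn2SchedRow RetunedRows[] = {
    {"WriteFCmp",  "Zn2FPU0",  1}, // compare latency lowered from 3 to 1
    {"WriteFRnd",  "Zn2FPU3",  3}, // rounding lowered from 4 to 3
    {"WriteFMulY", "Zn2FPU01", 3}, // 256-bit FP multiply lowered from 4 to 3
};

The *Pair helpers also derive the folded-load variants of each class, which is why several entries in the diff carry an extra load-latency figure (the 7 in "3, [1], 1, 7, 1" and similar).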
@@ -1376,7 +1384,7 @@ defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>; //-- SSE4A instructions --// // EXTRQ def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> { - let Latency = 2; + let Latency = 3; } def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>; @@ -1448,12 +1456,6 @@ def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>; //-- Arithmetic instructions --// -// HADD, HSUB PS/PD -def : SchedAlias<WriteFHAdd, Zn2WriteMicrocoded>; -def : SchedAlias<WriteFHAddLd, Zn2WriteMicrocoded>; -def : SchedAlias<WriteFHAddY, Zn2WriteMicrocoded>; -def : SchedAlias<WriteFHAddYLd, Zn2WriteMicrocoded>; - // VDIVPS. // TODO - convert to Zn2WriteResFpuPair // y,y,y. @@ -1490,11 +1492,9 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>; // DPPS. // x,x,i / v,v,v,i. -def : SchedAlias<WriteDPPS, Zn2WriteMicrocoded>; def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>; // x,m,i / v,v,m,i. -def : SchedAlias<WriteDPPSLd, Zn2WriteMicrocoded>; def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>; // DPPD. diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index 1ae8df977f83..ce8d1d464da9 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -15,6 +15,7 @@ #include "X86InstrInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" @@ -45,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, bool isVolatile, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = @@ -65,7 +66,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( // If not DWORD aligned or size is more than the threshold, call the library. // The libc version is likely to be faster for these cases. It can use the // address value and run time information about the CPU. - if ((Align & 3) != 0 || !ConstantSize || + if (Alignment < Align(4) || !ConstantSize || ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val); @@ -111,28 +112,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( uint64_t Val = ValC->getZExtValue() & 255; // If the value is a constant, then we can potentially use larger sets. 
- switch (Align & 3) { - case 2: // WORD aligned - AVT = MVT::i16; - ValReg = X86::AX; - Val = (Val << 8) | Val; - break; - case 0: // DWORD aligned + if (Alignment > Align(2)) { + // DWORD aligned AVT = MVT::i32; ValReg = X86::EAX; Val = (Val << 8) | Val; Val = (Val << 16) | Val; - if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned + if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned AVT = MVT::i64; ValReg = X86::RAX; Val = (Val << 32) | Val; } - break; - default: // Byte aligned + } else if (Alignment == Align(2)) { + // WORD aligned + AVT = MVT::i16; + ValReg = X86::AX; + Val = (Val << 8) | Val; + } else { + // Byte aligned AVT = MVT::i8; ValReg = X86::AL; Count = DAG.getIntPtrConstant(SizeVal, dl); - break; } if (AVT.bitsGT(MVT::i8)) { @@ -169,13 +169,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( EVT AddrVT = Dst.getValueType(); EVT SizeVT = Size.getValueType(); - Chain = DAG.getMemset(Chain, dl, - DAG.getNode(ISD::ADD, dl, AddrVT, Dst, - DAG.getConstant(Offset, dl, AddrVT)), - Val, - DAG.getConstant(BytesLeft, dl, SizeVT), - Align, isVolatile, false, - DstPtrInfo.getWithOffset(Offset)); + Chain = + DAG.getMemset(Chain, dl, + DAG.getNode(ISD::ADD, dl, AddrVT, Dst, + DAG.getConstant(Offset, dl, AddrVT)), + Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment, + isVolatile, false, DstPtrInfo.getWithOffset(Offset)); } // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. @@ -283,7 +282,7 @@ static SDValue emitConstantSizeRepmov( Chain, dl, DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)), DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)), - DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile, + DAG.getConstant(BytesLeft, dl, SizeVT), llvm::Align(Align), isVolatile, /*AlwaysInline*/ true, /*isTailCall*/ false, DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset))); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results); @@ -291,7 +290,7 @@ static SDValue emitConstantSizeRepmov( SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, + SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { // If to a segment-relative address space, use the default lowering. 
if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256) @@ -309,10 +308,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( /// Handle constant sizes, if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size)) - return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src, - ConstantSize->getZExtValue(), - Size.getValueType(), Align, isVolatile, - AlwaysInline, DstPtrInfo, SrcPtrInfo); + return emitConstantSizeRepmov( + DAG, Subtarget, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), + Size.getValueType(), Alignment.value(), isVolatile, AlwaysInline, + DstPtrInfo, SrcPtrInfo); return SDValue(); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h index 0f2d979f91e3..dac62973636c 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h @@ -14,14 +14,9 @@ #define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H #include "llvm/CodeGen/SelectionDAGTargetInfo.h" -#include "llvm/MC/MCRegisterInfo.h" namespace llvm { -class X86TargetLowering; -class X86TargetMachine; -class X86Subtarget; - class X86SelectionDAGInfo : public SelectionDAGTargetInfo { /// Returns true if it is possible for the base register to conflict with the /// given set of clobbers for a memory intrinsic. @@ -33,13 +28,14 @@ public: SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, + SDValue Size, Align Alignment, + bool isVolatile, MachinePointerInfo DstPtrInfo) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, - bool AlwaysInline, + SDValue Size, Align Alignment, + bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const override; }; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index a202fc63637b..de528299654c 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -11,8 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "Utils/X86ShuffleDecode.h" +#include "X86ShuffleDecodeConstantPool.h" +#include "MCTargetDesc/X86ShuffleDecode.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/IR/Constants.h" //===----------------------------------------------------------------------===// @@ -34,17 +36,17 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits, // // <4 x i32> <i32 -2147483648, i32 -2147483648, // i32 -2147483648, i32 -2147483648> - Type *CstTy = C->getType(); - if (!CstTy->isVectorTy()) + auto *CstTy = dyn_cast<FixedVectorType>(C->getType()); + if (!CstTy) return false; - Type *CstEltTy = CstTy->getVectorElementType(); + Type *CstEltTy = CstTy->getElementType(); if (!CstEltTy->isIntegerTy()) return false; unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); - unsigned NumCstElts = CstTy->getVectorNumElements(); + unsigned NumCstElts = CstTy->getNumElements(); assert((CstSizeInBits % MaskEltSizeInBits) == 0 && "Unaligned shuffle mask size"); @@ 
-185,13 +187,12 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width, } void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize, - unsigned Width, - SmallVectorImpl<int> &ShuffleMask) { + unsigned Width, SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); (void)MaskTySize; - assert((MaskTySize == 128 || MaskTySize == 256) && - Width >= MaskTySize && "Unexpected vector size."); + assert((MaskTySize == 128 || MaskTySize == 256) && Width >= MaskTySize && + "Unexpected vector size."); // The shuffle mask requires elements the same size as the target. APInt UndefElts; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h index 296341517579..51229a69a626 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h @@ -14,15 +14,13 @@ #ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H #define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H -#include "llvm/ADT/SmallVector.h" - //===----------------------------------------------------------------------===// // Vector Mask Decoding //===----------------------------------------------------------------------===// namespace llvm { class Constant; -class MVT; +template <typename T> class SmallVectorImpl; /// Decode a PSHUFB mask from an IR-level vector constant. void DecodePSHUFBMask(const Constant *C, unsigned Width, @@ -33,9 +31,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width, SmallVectorImpl<int> &ShuffleMask); /// Decode a VPERMILP2 variable mask from an IR-level vector constant. -void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize, - unsigned Width, - SmallVectorImpl<int> &ShuffleMask); +void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize, + unsigned Width, SmallVectorImpl<int> &ShuffleMask); /// Decode a VPPERM variable mask from an IR-level vector constant. void DecodeVPPERMMask(const Constant *C, unsigned Width, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp new file mode 100644 index 000000000000..7e91c37367d2 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp @@ -0,0 +1,181 @@ +//===-- X86SpeculativeExecutionSideEffectSuppression.cpp ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file contains the X86 implementation of the speculative execution side +/// effect suppression mitigation. +/// +/// This must be used with the -mlvi-cfi flag in order to mitigate indirect +/// branches and returns. 
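The pass body further down gates itself on three conditions: the hidden force-enable flag, LVI load hardening requested at -O0 (where SESES acts as the fallback), or the dedicated subtarget feature. A minimal sketch of that predicate, with hypothetical parameter names standing in for the real queries (the cl::opt flag, Subtarget.useLVILoadHardening(), CodeGenOpt::None, and Subtarget.useSpeculativeExecutionSideEffectSuppression()):

// Sketch only; mirrors the enabling check in runOnMachineFunction below.
// Parameter names are hypothetical stand-ins, not LLVM APIs.
static bool shouldRunSESES(bool ForcedByFlag, bool UseLVILoadHardening,
                           bool OptNone, bool HasSESESFeature) {
  return ForcedByFlag || (UseLVILoadHardening && OptNone) || HasSESESFeature;
}

Everything else in the pass is a per-basic-block walk that inserts an LFENCE before memory-touching instructions and, unless disabled by the flags above, before each block's group of terminators.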
+//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-seses" + +STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted"); + +static cl::opt<bool> EnableSpeculativeExecutionSideEffectSuppression( + "x86-seses-enable-without-lvi-cfi", + cl::desc("Force enable speculative execution side effect suppression. " + "(Note: User must pass -mlvi-cfi in order to mitigate indirect " + "branches and returns.)"), + cl::init(false), cl::Hidden); + +static cl::opt<bool> OneLFENCEPerBasicBlock( + "x86-seses-one-lfence-per-bb", + cl::desc( + "Omit all lfences other than the first to be placed in a basic block."), + cl::init(false), cl::Hidden); + +static cl::opt<bool> OnlyLFENCENonConst( + "x86-seses-only-lfence-non-const", + cl::desc("Only lfence before groups of terminators where at least one " + "branch instruction has an input to the addressing mode that is a " + "register other than %rip."), + cl::init(false), cl::Hidden); + +static cl::opt<bool> + OmitBranchLFENCEs("x86-seses-omit-branch-lfences", + cl::desc("Omit all lfences before branch instructions."), + cl::init(false), cl::Hidden); + +namespace { + +class X86SpeculativeExecutionSideEffectSuppression + : public MachineFunctionPass { +public: + X86SpeculativeExecutionSideEffectSuppression() : MachineFunctionPass(ID) {} + + static char ID; + StringRef getPassName() const override { + return "X86 Speculative Execution Side Effect Suppression"; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // namespace + +char X86SpeculativeExecutionSideEffectSuppression::ID = 0; + +// This function returns whether the passed instruction uses a memory addressing +// mode that is constant. We treat all memory addressing modes that read +// from a register that is not %rip as non-constant. Note that the use +// of the EFLAGS register results in an addressing mode being considered +// non-constant, therefore all JCC instructions will return false from this +// function since one of their operands will always be the EFLAGS register. +static bool hasConstantAddressingMode(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.uses()) + if (MO.isReg() && X86::RIP != MO.getReg()) + return false; + return true; +} + +bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction( + MachineFunction &MF) { + + const auto &OptLevel = MF.getTarget().getOptLevel(); + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + + // Check whether SESES needs to run as the fallback for LVI at O0, whether the + // user explicitly passed an SESES flag, or whether the SESES target feature + // was set. 
+ if (!EnableSpeculativeExecutionSideEffectSuppression && + !(Subtarget.useLVILoadHardening() && OptLevel == CodeGenOpt::None) && + !Subtarget.useSpeculativeExecutionSideEffectSuppression()) + return false; + + LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << " **********\n"); + bool Modified = false; + const X86InstrInfo *TII = Subtarget.getInstrInfo(); + for (MachineBasicBlock &MBB : MF) { + MachineInstr *FirstTerminator = nullptr; + // Keep track of whether the previous instruction was an LFENCE to avoid + // adding redundant LFENCEs. + bool PrevInstIsLFENCE = false; + for (auto &MI : MBB) { + + if (MI.getOpcode() == X86::LFENCE) { + PrevInstIsLFENCE = true; + continue; + } + // We want to put an LFENCE before any instruction that + // may load or store. This LFENCE is intended to avoid leaking any secret + // data due to a given load or store. This results in closing the cache + // and memory timing side channels. We will treat terminators that load + // or store separately. + if (MI.mayLoadOrStore() && !MI.isTerminator()) { + if (!PrevInstIsLFENCE) { + BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE)); + NumLFENCEsInserted++; + Modified = true; + } + if (OneLFENCEPerBasicBlock) + break; + } + // The following section will be LFENCEing before groups of terminators + // that include branches. This will close the branch prediction side + // channels since we will prevent code executing after misspeculation as + // a result of the LFENCEs placed with this logic. + + // Keep track of the first terminator in a basic block since if we need + // to LFENCE the terminators in this basic block we must add the + // instruction before the first terminator in the basic block (as + // opposed to before the terminator that indicates an LFENCE is + // required). An example of why this is necessary is that the + // X86InstrInfo::analyzeBranch method assumes all terminators are grouped + // together and terminates it's analysis once the first non-termintor + // instruction is found. + if (MI.isTerminator() && FirstTerminator == nullptr) + FirstTerminator = &MI; + + // Look for branch instructions that will require an LFENCE to be put + // before this basic block's terminators. + if (!MI.isBranch() || OmitBranchLFENCEs) { + // This isn't a branch or we're not putting LFENCEs before branches. + PrevInstIsLFENCE = false; + continue; + } + + if (OnlyLFENCENonConst && hasConstantAddressingMode(MI)) { + // This is a branch, but it only has constant addressing mode and we're + // not adding LFENCEs before such branches. + PrevInstIsLFENCE = false; + continue; + } + + // This branch requires adding an LFENCE. 
+ if (!PrevInstIsLFENCE) { + BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE)); + NumLFENCEsInserted++; + Modified = true; + } + break; + } + } + + return Modified; +} + +FunctionPass *llvm::createX86SpeculativeExecutionSideEffectSuppression() { + return new X86SpeculativeExecutionSideEffectSuppression(); +} + +INITIALIZE_PASS(X86SpeculativeExecutionSideEffectSuppression, "x86-seses", + "X86 Speculative Execution Side Effect Suppression", false, + false) diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 9aa47c532e82..fe5b9a05f811 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -53,6 +53,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" #include <algorithm> #include <cassert> #include <iterator> @@ -872,10 +873,10 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( case X86::FARCALL16m: case X86::FARCALL32m: - case X86::FARCALL64: + case X86::FARCALL64m: case X86::FARJMP16m: case X86::FARJMP32m: - case X86::FARJMP64: + case X86::FARJMP64m: // We cannot mitigate far jumps or calls, but we also don't expect them // to be vulnerable to Spectre v1.2 style attacks. continue; @@ -920,6 +921,11 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads( // Now stitch the new instructions into place and erase the old one. for (auto *NewMI : NewMIs) MBB.insert(MI.getIterator(), NewMI); + + // Update the call site info. + if (MI.isCandidateForCallSiteEntry()) + MF.eraseCallSiteInfo(&MI); + MI.eraseFromParent(); LLVM_DEBUG({ dbgs() << "Unfolded load successfully into:\n"; @@ -993,7 +999,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( case X86::FARJMP16m: case X86::FARJMP32m: - case X86::FARJMP64: + case X86::FARJMP64m: // We cannot mitigate far jumps or calls, but we also don't expect them // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks. continue; @@ -1195,394 +1201,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches( return CMovs; } -/// Returns true if the instruction has no behavior (specified or otherwise) -/// that is based on the value of any of its register operands -/// -/// A classical example of something that is inherently not data invariant is an -/// indirect jump -- the destination is loaded into icache based on the bits set -/// in the jump destination register. -/// -/// FIXME: This should become part of our instruction tables. -static bool isDataInvariant(MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - // By default, assume that the instruction is not data invariant. - return false; - - // Some target-independent operations that trivially lower to data-invariant - // instructions. - case TargetOpcode::COPY: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - return true; - - // On x86 it is believed that imul is constant time w.r.t. the loaded data. - // However, they set flags and are perhaps the most surprisingly constant - // time operations so we call them out here separately. 
- case X86::IMUL16rr: - case X86::IMUL16rri8: - case X86::IMUL16rri: - case X86::IMUL32rr: - case X86::IMUL32rri8: - case X86::IMUL32rri: - case X86::IMUL64rr: - case X86::IMUL64rri32: - case X86::IMUL64rri8: - - // Bit scanning and counting instructions that are somewhat surprisingly - // constant time as they scan across bits and do other fairly complex - // operations like popcnt, but are believed to be constant time on x86. - // However, these set flags. - case X86::BSF16rr: - case X86::BSF32rr: - case X86::BSF64rr: - case X86::BSR16rr: - case X86::BSR32rr: - case X86::BSR64rr: - case X86::LZCNT16rr: - case X86::LZCNT32rr: - case X86::LZCNT64rr: - case X86::POPCNT16rr: - case X86::POPCNT32rr: - case X86::POPCNT64rr: - case X86::TZCNT16rr: - case X86::TZCNT32rr: - case X86::TZCNT64rr: - - // Bit manipulation instructions are effectively combinations of basic - // arithmetic ops, and should still execute in constant time. These also - // set flags. - case X86::BLCFILL32rr: - case X86::BLCFILL64rr: - case X86::BLCI32rr: - case X86::BLCI64rr: - case X86::BLCIC32rr: - case X86::BLCIC64rr: - case X86::BLCMSK32rr: - case X86::BLCMSK64rr: - case X86::BLCS32rr: - case X86::BLCS64rr: - case X86::BLSFILL32rr: - case X86::BLSFILL64rr: - case X86::BLSI32rr: - case X86::BLSI64rr: - case X86::BLSIC32rr: - case X86::BLSIC64rr: - case X86::BLSMSK32rr: - case X86::BLSMSK64rr: - case X86::BLSR32rr: - case X86::BLSR64rr: - case X86::TZMSK32rr: - case X86::TZMSK64rr: - - // Bit extracting and clearing instructions should execute in constant time, - // and set flags. - case X86::BEXTR32rr: - case X86::BEXTR64rr: - case X86::BEXTRI32ri: - case X86::BEXTRI64ri: - case X86::BZHI32rr: - case X86::BZHI64rr: - - // Shift and rotate. - case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1: - case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL: - case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri: - case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1: - case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL: - case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri: - case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1: - case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL: - case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri: - case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1: - case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL: - case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri: - case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1: - case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL: - case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri: - case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL: - case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8: - case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL: - case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8: - - // Basic arithmetic is constant time on the input but does set flags. 
- case X86::ADC8rr: case X86::ADC8ri: - case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8: - case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8: - case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32: - case X86::ADD8rr: case X86::ADD8ri: - case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8: - case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8: - case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32: - case X86::AND8rr: case X86::AND8ri: - case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8: - case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8: - case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32: - case X86::OR8rr: case X86::OR8ri: - case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8: - case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8: - case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32: - case X86::SBB8rr: case X86::SBB8ri: - case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8: - case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8: - case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32: - case X86::SUB8rr: case X86::SUB8ri: - case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8: - case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8: - case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32: - case X86::XOR8rr: case X86::XOR8ri: - case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8: - case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8: - case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32: - // Arithmetic with just 32-bit and 64-bit variants and no immediates. - case X86::ADCX32rr: case X86::ADCX64rr: - case X86::ADOX32rr: case X86::ADOX64rr: - case X86::ANDN32rr: case X86::ANDN64rr: - // Unary arithmetic operations. - case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r: - case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r: - case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: - // Check whether the EFLAGS implicit-def is dead. We assume that this will - // always find the implicit-def because this code should only be reached - // for instructions that do in fact implicitly def this. - if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) { - // If we would clobber EFLAGS that are used, just bail for now. - LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: "; - MI.dump(); dbgs() << "\n"); - return false; - } - - // Otherwise, fallthrough to handle these the same as instructions that - // don't set EFLAGS. - LLVM_FALLTHROUGH; - - // Unlike other arithmetic, NOT doesn't set EFLAGS. - case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r: - - // Various move instructions used to zero or sign extend things. Note that we - // intentionally don't support the _NOREX variants as we can't handle that - // register constraint anyways. - case X86::MOVSX16rr8: - case X86::MOVSX32rr8: case X86::MOVSX32rr16: - case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32: - case X86::MOVZX16rr8: - case X86::MOVZX32rr8: case X86::MOVZX32rr16: - case X86::MOVZX64rr8: case X86::MOVZX64rr16: - case X86::MOV32rr: - - // Arithmetic instructions that are both constant time and don't set flags. - case X86::RORX32ri: - case X86::RORX64ri: - case X86::SARX32rr: - case X86::SARX64rr: - case X86::SHLX32rr: - case X86::SHLX64rr: - case X86::SHRX32rr: - case X86::SHRX64rr: - - // LEA doesn't actually access memory, and its arithmetic is constant time. 
- case X86::LEA16r: - case X86::LEA32r: - case X86::LEA64_32r: - case X86::LEA64r: - return true; - } -} - -/// Returns true if the instruction has no behavior (specified or otherwise) -/// that is based on the value loaded from memory or the value of any -/// non-address register operands. -/// -/// For example, if the latency of the instruction is dependent on the -/// particular bits set in any of the registers *or* any of the bits loaded from -/// memory. -/// -/// A classical example of something that is inherently not data invariant is an -/// indirect jump -- the destination is loaded into icache based on the bits set -/// in the jump destination register. -/// -/// FIXME: This should become part of our instruction tables. -static bool isDataInvariantLoad(MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - // By default, assume that the load will immediately leak. - return false; - - // On x86 it is believed that imul is constant time w.r.t. the loaded data. - // However, they set flags and are perhaps the most surprisingly constant - // time operations so we call them out here separately. - case X86::IMUL16rm: - case X86::IMUL16rmi8: - case X86::IMUL16rmi: - case X86::IMUL32rm: - case X86::IMUL32rmi8: - case X86::IMUL32rmi: - case X86::IMUL64rm: - case X86::IMUL64rmi32: - case X86::IMUL64rmi8: - - // Bit scanning and counting instructions that are somewhat surprisingly - // constant time as they scan across bits and do other fairly complex - // operations like popcnt, but are believed to be constant time on x86. - // However, these set flags. - case X86::BSF16rm: - case X86::BSF32rm: - case X86::BSF64rm: - case X86::BSR16rm: - case X86::BSR32rm: - case X86::BSR64rm: - case X86::LZCNT16rm: - case X86::LZCNT32rm: - case X86::LZCNT64rm: - case X86::POPCNT16rm: - case X86::POPCNT32rm: - case X86::POPCNT64rm: - case X86::TZCNT16rm: - case X86::TZCNT32rm: - case X86::TZCNT64rm: - - // Bit manipulation instructions are effectively combinations of basic - // arithmetic ops, and should still execute in constant time. These also - // set flags. - case X86::BLCFILL32rm: - case X86::BLCFILL64rm: - case X86::BLCI32rm: - case X86::BLCI64rm: - case X86::BLCIC32rm: - case X86::BLCIC64rm: - case X86::BLCMSK32rm: - case X86::BLCMSK64rm: - case X86::BLCS32rm: - case X86::BLCS64rm: - case X86::BLSFILL32rm: - case X86::BLSFILL64rm: - case X86::BLSI32rm: - case X86::BLSI64rm: - case X86::BLSIC32rm: - case X86::BLSIC64rm: - case X86::BLSMSK32rm: - case X86::BLSMSK64rm: - case X86::BLSR32rm: - case X86::BLSR64rm: - case X86::TZMSK32rm: - case X86::TZMSK64rm: - - // Bit extracting and clearing instructions should execute in constant time, - // and set flags. - case X86::BEXTR32rm: - case X86::BEXTR64rm: - case X86::BEXTRI32mi: - case X86::BEXTRI64mi: - case X86::BZHI32rm: - case X86::BZHI64rm: - - // Basic arithmetic is constant time on the input but does set flags. 
- case X86::ADC8rm: - case X86::ADC16rm: - case X86::ADC32rm: - case X86::ADC64rm: - case X86::ADCX32rm: - case X86::ADCX64rm: - case X86::ADD8rm: - case X86::ADD16rm: - case X86::ADD32rm: - case X86::ADD64rm: - case X86::ADOX32rm: - case X86::ADOX64rm: - case X86::AND8rm: - case X86::AND16rm: - case X86::AND32rm: - case X86::AND64rm: - case X86::ANDN32rm: - case X86::ANDN64rm: - case X86::OR8rm: - case X86::OR16rm: - case X86::OR32rm: - case X86::OR64rm: - case X86::SBB8rm: - case X86::SBB16rm: - case X86::SBB32rm: - case X86::SBB64rm: - case X86::SUB8rm: - case X86::SUB16rm: - case X86::SUB32rm: - case X86::SUB64rm: - case X86::XOR8rm: - case X86::XOR16rm: - case X86::XOR32rm: - case X86::XOR64rm: - // Check whether the EFLAGS implicit-def is dead. We assume that this will - // always find the implicit-def because this code should only be reached - // for instructions that do in fact implicitly def this. - if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) { - // If we would clobber EFLAGS that are used, just bail for now. - LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: "; - MI.dump(); dbgs() << "\n"); - return false; - } - - // Otherwise, fallthrough to handle these the same as instructions that - // don't set EFLAGS. - LLVM_FALLTHROUGH; - - // Integer multiply w/o affecting flags is still believed to be constant - // time on x86. Called out separately as this is among the most surprising - // instructions to exhibit that behavior. - case X86::MULX32rm: - case X86::MULX64rm: - - // Arithmetic instructions that are both constant time and don't set flags. - case X86::RORX32mi: - case X86::RORX64mi: - case X86::SARX32rm: - case X86::SARX64rm: - case X86::SHLX32rm: - case X86::SHLX64rm: - case X86::SHRX32rm: - case X86::SHRX64rm: - - // Conversions are believed to be constant time and don't set flags. - case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm: - case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm: - case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm: - case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm: - case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm: - case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm: - case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm: - case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm: - case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm: - case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm: - // AVX512 added unsigned integer conversions. - case X86::VCVTTSD2USI64Zrm: - case X86::VCVTTSD2USIZrm: - case X86::VCVTTSS2USI64Zrm: - case X86::VCVTTSS2USIZrm: - case X86::VCVTUSI2SDZrm: - case X86::VCVTUSI642SDZrm: - case X86::VCVTUSI2SSZrm: - case X86::VCVTUSI642SSZrm: - - // Loads to register don't set flags. 
- case X86::MOV8rm: - case X86::MOV8rm_NOREX: - case X86::MOV16rm: - case X86::MOV32rm: - case X86::MOV64rm: - case X86::MOVSX16rm8: - case X86::MOVSX32rm16: - case X86::MOVSX32rm8: - case X86::MOVSX32rm8_NOREX: - case X86::MOVSX64rm16: - case X86::MOVSX64rm32: - case X86::MOVSX64rm8: - case X86::MOVZX16rm8: - case X86::MOVZX32rm16: - case X86::MOVZX32rm8: - case X86::MOVZX32rm8_NOREX: - case X86::MOVZX64rm16: - case X86::MOVZX64rm8: - return true; +// Returns true if the MI has EFLAGS as a register def operand and it's live, +// otherwise it returns false +static bool isEFLAGSDefLive(const MachineInstr &MI) { + if (const MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) { + return !DefOp->isDead(); } + return false; } static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, @@ -1740,8 +1365,9 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden( // address registers, queue it up to be hardened post-load. Notably, // even once hardened this won't introduce a useful dependency that // could prune out subsequent loads. - if (EnablePostLoadHardening && isDataInvariantLoad(MI) && - MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() && + if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) && + !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 && + MI.getOperand(0).isReg() && canHardenRegister(MI.getOperand(0).getReg()) && !HardenedAddrRegs.count(BaseReg) && !HardenedAddrRegs.count(IndexReg)) { @@ -1795,9 +1421,10 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden( if (HardenPostLoad.erase(&MI)) { assert(!MI.isCall() && "Must not try to post-load harden a call!"); - // If this is a data-invariant load, we want to try and sink any - // hardening as far as possible. - if (isDataInvariantLoad(MI)) { + // If this is a data-invariant load and there is no EFLAGS + // interference, we want to try and sink any hardening as far as + // possible. + if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) { // Sink the instruction we'll need to harden as far as we can down // the graph. MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad); @@ -2085,9 +1712,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( // Broadcast our state into a vector register. Register VStateReg = MRI->createVirtualRegister(OpRC); - unsigned BroadcastOp = - Is128Bit ? X86::VPBROADCASTQrZ128r - : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr; + unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr + : Is256Bit ? X86::VPBROADCASTQrZ256rr + : X86::VPBROADCASTQrZrr; auto BroadcastI = BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg) .addReg(StateReg); @@ -2147,8 +1774,11 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr( MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) { - assert(isDataInvariantLoad(InitialMI) && + assert(X86InstrInfo::isDataInvariantLoad(InitialMI) && "Cannot get here with a non-invariant load!"); + assert(!isEFLAGSDefLive(InitialMI) && + "Cannot get here with a data invariant load " + "that interferes with EFLAGS!"); // See if we can sink hardening the loaded value. auto SinkCheckToSingleUse = @@ -2160,14 +1790,14 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // own. 
MachineInstr *SingleUseMI = nullptr; for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) { - // If we're already going to harden this use, it is data invariant and - // within our block. + // If we're already going to harden this use, it is data invariant, it + // does not interfere with EFLAGS, and within our block. if (HardenedInstrs.count(&UseMI)) { - if (!isDataInvariantLoad(UseMI)) { + if (!X86InstrInfo::isDataInvariantLoad(UseMI) || isEFLAGSDefLive(UseMI)) { // If we've already decided to harden a non-load, we must have sunk // some other post-load hardened instruction to it and it must itself // be data-invariant. - assert(isDataInvariant(UseMI) && + assert(X86InstrInfo::isDataInvariant(UseMI) && "Data variant instruction being hardened!"); continue; } @@ -2199,7 +1829,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( // If this single use isn't data invariant, isn't in this block, or has // interfering EFLAGS, we can't sink the hardening to it. - if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent()) + if (!X86InstrInfo::isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent() || + isEFLAGSDefLive(UseMI)) return {}; // If this instruction defines multiple registers bail as we won't harden @@ -2590,10 +2221,10 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr( switch (MI.getOpcode()) { case X86::FARCALL16m: case X86::FARCALL32m: - case X86::FARCALL64: + case X86::FARCALL64m: case X86::FARJMP16m: case X86::FARJMP32m: - case X86::FARJMP64: + case X86::FARJMP64m: // We don't need to harden either far calls or far jumps as they are // safe from Spectre. return; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp index 75c3a70b430a..975cbabb30fd 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp @@ -10,14 +10,13 @@ // //===----------------------------------------------------------------------===// +#include "X86Subtarget.h" +#include "MCTargetDesc/X86BaseInfo.h" #include "X86.h" - #include "X86CallLowering.h" #include "X86LegalizerInfo.h" #include "X86MacroFusion.h" #include "X86RegisterBankInfo.h" -#include "X86Subtarget.h" -#include "MCTargetDesc/X86BaseInfo.h" #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" @@ -89,7 +88,9 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const { // Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data. case CodeModel::Medium: - if (isa<Function>(GV)) + // Constant pool and jump table handling pass a nullptr to this + // function so we need to use isa_and_nonnull. + if (isa_and_nonnull<Function>(GV)) return X86II::MO_NO_FLAG; // All code is RIP-relative return X86II::MO_GOTOFF; // Local symbols use GOTOFF. } @@ -227,11 +228,11 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const { } void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { - std::string CPUName = CPU; + std::string CPUName = std::string(CPU); if (CPUName.empty()) CPUName = "generic"; - std::string FullFS = FS; + std::string FullFS = std::string(FS); if (In64BitMode) { // SSE2 should default to enabled in 64-bit mode, but can be turned off // explicitly. 
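The classifyLocalReference hunk above switches from isa<> to isa_and_nonnull<> because constant-pool and jump-table lowering call this hook with a null GlobalValue; isa<Function>(nullptr) asserts, while isa_and_nonnull<Function>(nullptr) simply returns false. A small self-contained illustration (the helper name is made up; the casting semantics are the standard ones from llvm/Support/Casting.h):

#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"

// Illustration: a null GV (as passed by constant-pool / jump-table handling)
// falls through to the "data" classification instead of asserting.
static bool isRipRelativeCode(const llvm::GlobalValue *GV) {
  return llvm::isa_and_nonnull<llvm::Function>(GV); // false when GV is null
}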
@@ -379,3 +380,7 @@ void X86Subtarget::getPostRAMutations( std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { Mutations.push_back(createX86MacroFusionDAGMutation()); } + +bool X86Subtarget::isPositionIndependent() const { + return TM.isPositionIndependent(); +} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h index af5153243c8b..de45d357e3c2 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h @@ -17,15 +17,9 @@ #include "X86ISelLowering.h" #include "X86InstrInfo.h" #include "X86SelectionDAGInfo.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/CallLowering.h" -#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" -#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" -#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/CallingConv.h" -#include "llvm/Target/TargetMachine.h" #include <climits> #include <memory> @@ -34,7 +28,13 @@ namespace llvm { +class CallLowering; class GlobalValue; +class InstructionSelector; +class LegalizerInfo; +class RegisterBankInfo; +class StringRef; +class TargetMachine; /// The X86 backend supports a number of different styles of PIC. /// @@ -258,6 +258,10 @@ protected: bool InsertVZEROUPPER = false; /// True if there is no performance penalty for writing NOPs with up to + /// 7 bytes. + bool HasFast7ByteNOP = false; + + /// True if there is no performance penalty for writing NOPs with up to /// 11 bytes. bool HasFast11ByteNOP = false; @@ -393,6 +397,17 @@ protected: /// Processor supports PCONFIG instruction bool HasPCONFIG = false; + /// Processor supports SERIALIZE instruction + bool HasSERIALIZE = false; + + /// Processor supports TSXLDTRK instruction + bool HasTSXLDTRK = false; + + /// Processor has AMX support + bool HasAMXTILE = false; + bool HasAMXBF16 = false; + bool HasAMXINT8 = false; + /// Processor has a single uop BEXTR implementation. bool HasFastBEXTR = false; @@ -427,6 +442,9 @@ protected: /// POP+LFENCE+JMP sequence. bool UseLVIControlFlowIntegrity = false; + /// Enable Speculative Execution Side Effect Suppression + bool UseSpeculativeExecutionSideEffectSuppression = false; + /// Insert LFENCE instructions to prevent data speculatively injected into /// loads from being used maliciously. bool UseLVILoadHardening = false; @@ -637,8 +655,15 @@ public: bool hasRTM() const { return HasRTM; } bool hasADX() const { return HasADX; } bool hasSHA() const { return HasSHA; } - bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; } + bool hasPRFCHW() const { return HasPRFCHW; } bool hasPREFETCHWT1() const { return HasPREFETCHWT1; } + bool hasPrefetchW() const { + // The PREFETCHW instruction was added with 3DNow but later CPUs gave it + // its own CPUID bit as part of deprecating 3DNow. Intel eventually added + // it and KNL has another that prefetches to L2 cache. We assume the + // L1 version exists if the L2 version does. 
+ return has3DNow() || hasPRFCHW() || hasPREFETCHWT1(); + } bool hasSSEPrefetch() const { // We implicitly enable these when we have a write prefix supporting cache // level OR if we have prfchw, but don't already have a read prefetch from @@ -712,10 +737,15 @@ public: bool threewayBranchProfitable() const { return ThreewayBranchProfitable; } bool hasINVPCID() const { return HasINVPCID; } bool hasENQCMD() const { return HasENQCMD; } + bool hasSERIALIZE() const { return HasSERIALIZE; } + bool hasTSXLDTRK() const { return HasTSXLDTRK; } bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } bool useRetpolineIndirectBranches() const { return UseRetpolineIndirectBranches; } + bool hasAMXTILE() const { return HasAMXTILE; } + bool hasAMXBF16() const { return HasAMXBF16; } + bool hasAMXINT8() const { return HasAMXINT8; } bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; } // These are generic getters that OR together all of the thunk types @@ -732,6 +762,9 @@ public: bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; } bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; } bool useLVILoadHardening() const { return UseLVILoadHardening; } + bool useSpeculativeExecutionSideEffectSuppression() const { + return UseSpeculativeExecutionSideEffectSuppression; + } unsigned getPreferVectorWidth() const { return PreferVectorWidth; } unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; } @@ -829,7 +862,7 @@ public: return PICStyle == PICStyles::Style::StubPIC; } - bool isPositionIndependent() const { return TM.isPositionIndependent(); } + bool isPositionIndependent() const; bool isCallingConvWin64(CallingConv::ID CC) const { switch (CC) { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp index 9f639ffa22ec..7344116e14af 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -73,18 +73,22 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeEvexToVexInstPassPass(PR); initializeFixupLEAPassPass(PR); initializeFPSPass(PR); + initializeX86FixupSetCCPassPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); initializeX86ExpandPseudoPass(PR); initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); initializeX86AvoidSFBPassPass(PR); + initializeX86AvoidTrailingCallPassPass(PR); initializeX86SpeculativeLoadHardeningPassPass(PR); + initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); + initializeX86PartialReductionPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -94,19 +98,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return std::make_unique<TargetLoweringObjectFileMachO>(); } - if (TT.isOSFreeBSD()) - return std::make_unique<X86FreeBSDTargetObjectFile>(); - if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) - return std::make_unique<X86LinuxNaClTargetObjectFile>(); - if (TT.isOSSolaris()) - return std::make_unique<X86SolarisTargetObjectFile>(); - if (TT.isOSFuchsia()) - return 
std::make_unique<X86FuchsiaTargetObjectFile>(); - if (TT.isOSBinFormatELF()) - return std::make_unique<X86ELFTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) return std::make_unique<TargetLoweringObjectFileCOFF>(); - llvm_unreachable("unknown subtarget type"); + return std::make_unique<X86ELFTargetObjectFile>(); } static std::string computeDataLayout(const Triple &TT) { @@ -234,6 +228,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, setMachineOutliner(true); + // x86 supports the debug entry values. + setSupportsDebugEntryValues(true); + initAsmInfo(); } @@ -317,14 +314,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const { } //===----------------------------------------------------------------------===// -// Command line options for x86 -//===----------------------------------------------------------------------===// -static cl::opt<bool> -UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, - cl::desc("Minimize AVX to SSE transition penalty"), - cl::init(true)); - -//===----------------------------------------------------------------------===// // X86 TTI query. //===----------------------------------------------------------------------===// @@ -408,8 +397,10 @@ void X86PassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + addPass(createX86PartialReductionPass()); + } // Add passes that handle indirect branch removal and insertion of a retpoline // thunk. These will be a no-op unless a function subtarget has the retpoline @@ -498,10 +489,12 @@ void X86PassConfig::addMachineSSAOptimization() { void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); + // When -O0 is enabled, the Load Value Injection Hardening pass will fall back + // to using the Speculative Execution Side Effect Suppression pass for + // mitigation. This is to prevent slow downs due to + // analyses needed by the LVIHardening pass when compiling at -O0. if (getOptLevel() != CodeGenOpt::None) addPass(createX86LoadValueInjectionLoadHardeningPass()); - else - addPass(createX86LoadValueInjectionLoadHardeningUnoptimizedPass()); } void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } @@ -514,23 +507,33 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86IndirectBranchTrackingPass()); - if (UseVZeroUpper) - addPass(createX86IssueVZeroUpperPass()); + addPass(createX86IssueVZeroUpperPass()); if (getOptLevel() != CodeGenOpt::None) { addPass(createX86FixupBWInsts()); addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); - addPass(createX86EvexToVexInsts()); } + addPass(createX86EvexToVexInsts()); addPass(createX86DiscriminateMemOpsPass()); addPass(createX86InsertPrefetchPass()); + addPass(createX86InsertX87waitPass()); } void X86PassConfig::addPreEmitPass2() { const Triple &TT = TM->getTargetTriple(); const MCAsmInfo *MAI = TM->getMCAsmInfo(); + // The X86 Speculative Execution Pass must run after all control + // flow graph modifying passes. As a result it was listed to run right before + // the X86 Retpoline Thunks pass. The reason it must run after control flow + // graph modifications is that the model of LFENCE in LLVM has to be updated + // (FIXME: https://bugs.llvm.org/show_bug.cgi?id=45167). 
Currently the + // placement of this pass was hand checked to ensure that the subsequent + // passes don't move the code around the LFENCEs in a way that will hurt the + // correctness of this pass. This placement has been shown to work based on + // hand inspection of the codegen output. + addPass(createX86SpeculativeExecutionSideEffectSuppression()); addPass(createX86IndirectThunksPass()); // Insert extra int3 instructions after trailing call instructions to avoid diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h index 757ce8bc5c72..8d98474a39c0 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h @@ -23,8 +23,6 @@ namespace llvm { class StringRef; -class X86Subtarget; -class X86RegisterBankInfo; class TargetTransformInfo; class X86TargetMachine final : public LLVMTargetMachine { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 44185957686b..2b48baccc01f 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; using namespace dwarf; @@ -63,30 +64,3 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( const MCSymbol *Sym) const { return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); } - -void -X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -void -X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -void -X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h index 1fd0bbf56b19..acea772eb036 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -10,7 +10,6 @@ #define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { @@ -44,33 +43,10 @@ namespace llvm { X86ELFTargetObjectFile() { PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT; } - /// Describe a TLS variable address within debug info. const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; }; - /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD - /// on x86 and x86-64. - class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - - /// This implementation is used for Fuchsia on x86-64. 
- class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - - /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and - /// Native Client on x86 and x86-64. - class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - - /// This implementation is used for Solaris on x86/x86-64. - class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - } // end namespace llvm #endif diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index b754836ea517..cc18e55656ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -170,12 +170,18 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { } int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { + // TODO: Handle more cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, + Op2Info, Opd1PropInfo, + Opd2PropInfo, Args, CxtI); // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -256,20 +262,25 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // The OperandValue properties may not be the same as that of the previous // operation; conservatively assume OP_None. int Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, + 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, + Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); if (ISD == ISD::SREM) { // For SREM: (X % C) is the equivalent of (X - (X/C)*C) - Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info); + Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, + Op2Info); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, + Op2Info); } return Cost; @@ -277,12 +288,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, // Vector unsigned division/remainder will be simplified to shifts/masks. 
if (ISD == ISD::UDIV) - return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, + Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); else // UREM - return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info, + return getArithmeticInstrCost(Instruction::And, Ty, CostKind, + Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); } @@ -304,6 +317,10 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v2i64, 1 }, { ISD::SRA, MVT::v4i64, 1 }, { ISD::SRA, MVT::v8i64, 1 }, + + { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. + { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. + { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -370,6 +387,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence + { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence + { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence + { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence + { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence + { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence + { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence + { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence + { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence }; if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue || @@ -446,11 +471,32 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return LT.first * Entry->Cost; } + static const CostTblEntry AVX512BWShiftCostTable[] = { + { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v8i16, 1 }, // vpsravw + + { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v16i16, 1 }, // vpsravw + + { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw + { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw + { ISD::SRA, MVT::v32i16, 1 }, // vpsravw + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry AVX2UniformCostTable[] = { // Uniform splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i16, 1 }, // psllw. { ISD::SRL, MVT::v16i16, 1 }, // psrlw. { ISD::SRA, MVT::v16i16, 1 }, // psraw. + { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw. + { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw. + { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw. }; if (ST->hasAVX2() && @@ -495,18 +541,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, return LT.first * Entry->Cost; static const CostTblEntry AVX512BWCostTable[] = { - { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw - { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw - { ISD::SRA, MVT::v8i16, 1 }, // vpsravw - - { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw - { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw - { ISD::SRA, MVT::v16i16, 1 }, // vpsravw - - { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw - { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw - { ISD::SRA, MVT::v32i16, 1 }, // vpsravw - { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence. { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence. { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence. 
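
Note: the per-subtarget cost tables being reshuffled in the hunks above (AVX512BWShiftCostTable, AVX2UniformCostTable, and so on) all feed the same lookup pattern: match an entry on the (ISD opcode, legalized MVT) pair, then scale the matched per-operation cost by the type-legalization split factor. The sketch below is a simplified, standalone model of that pattern, not the LLVM API itself; the enum, struct, and function names and the SplitFactor parameter are illustrative assumptions.

// Simplified, self-contained model of the cost-table lookup pattern used in
// the diff above.  The real code calls llvm::CostTableLookup over
// CostTblEntry arrays; everything named here is illustrative only.
#include <cstdio>

enum class Op { SHL, SRL, SRA };
enum class VT { v8i16, v16i16, v32i16 };

struct CostEntry { Op ISD; VT Ty; int Cost; };

// Mirrors the AVX512BW variable word-shift costs (vpsllvw/vpsrlvw/vpsravw = 1).
static const CostEntry ShiftCosts[] = {
    {Op::SHL, VT::v8i16, 1},  {Op::SRL, VT::v8i16, 1},  {Op::SRA, VT::v8i16, 1},
    {Op::SHL, VT::v16i16, 1}, {Op::SRL, VT::v16i16, 1}, {Op::SRA, VT::v16i16, 1},
    {Op::SHL, VT::v32i16, 1}, {Op::SRL, VT::v32i16, 1}, {Op::SRA, VT::v32i16, 1},
};

// Returns the table cost scaled by the legalization split factor, or -1 when
// the (opcode, type) pair is not covered and a fallback path would be taken.
int lookupShiftCost(Op ISD, VT Ty, int SplitFactor) {
  for (const CostEntry &E : ShiftCosts)
    if (E.ISD == ISD && E.Ty == Ty)
      return SplitFactor * E.Cost;
  return -1;
}

int main() {
  // A v32i16 variable arithmetic shift on a BWI-capable target legalizes
  // without splitting (SplitFactor == 1), so the modeled cost is 1.
  std::printf("%d\n", lookupShiftCost(Op::SRA, VT::v32i16, 1));
}
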
@@ -533,6 +567,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v4i64, 1 }, { ISD::SRA, MVT::v8i64, 1 }, + { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence. { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org) @@ -568,6 +603,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRL, MVT::v4i64, 1 }, }; + if (ST->hasAVX512()) { + if (ISD == ISD::SHL && LT.second == MVT::v32i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX512, a packed v32i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, + Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + } + // Look for AVX2 lowering tricks. if (ST->hasAVX2()) { if (ISD == ISD::SHL && LT.second == MVT::v16i16 && @@ -575,7 +622,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) // On AVX2, a packed v16i16 shift left by a constant build_vector // is lowered into a vector multiply (vpmullw). - return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info, + return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, + Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -667,13 +715,19 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, static const CostTblEntry AVX2CostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence. { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence. + { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence. { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. + { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence. { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. + { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. + { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence. { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. @@ -877,20 +931,20 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || ISD == ISD::UREM)) { int ScalarCost = getArithmeticInstrCost( - Opcode, Ty->getScalarType(), Op1Info, Op2Info, + Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; } // Fallback to the default implementation. - return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); + return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info); } -int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, + int Index, VectorType *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. 
// 64-bit packed integer vectors (v2i32) are widened to type v4i32. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp); // Treat Transpose as 2-op shuffles - there's no difference in lowering. if (Kind == TTI::SK_Transpose) @@ -919,19 +973,19 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, // FIXME: Remove some of the alignment restrictions. // FIXME: We can use permq for 64-bit or larger extracts from 256-bit // vectors. - int OrigSubElts = SubTp->getVectorNumElements(); - if (NumSubElts > OrigSubElts && - (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 && + int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); + if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && + (NumSubElts % OrigSubElts) == 0 && LT.second.getVectorElementType() == - SubLT.second.getVectorElementType() && + SubLT.second.getVectorElementType() && LT.second.getVectorElementType().getSizeInBits() == - Tp->getVectorElementType()->getPrimitiveSizeInBits()) { + BaseTp->getElementType()->getPrimitiveSizeInBits()) { assert(NumElts >= NumSubElts && NumElts > OrigSubElts && "Unexpected number of elements!"); - Type *VecTy = VectorType::get(Tp->getVectorElementType(), - LT.second.getVectorNumElements()); - Type *SubTy = VectorType::get(Tp->getVectorElementType(), - SubLT.second.getVectorNumElements()); + auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), + LT.second.getVectorNumElements()); + auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), + SubLT.second.getVectorNumElements()); int ExtractIndex = alignDown((Index % NumElts), NumSubElts); int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy, ExtractIndex, SubTy); @@ -949,6 +1003,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, } } + // Handle some common (illegal) sub-vector types as they are often very cheap + // to shuffle even on targets without PSHUFB. + EVT VT = TLI->getValueType(DL, BaseTp); + if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && + !ST->hasSSSE3()) { + static const CostTblEntry SSE2SubVectorShuffleTbl[] = { + {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw + {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw + {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw + {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw + {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck + + {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw + {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw + {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus + {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck + + {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw + {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw + {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw + {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw + {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck + + {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw + {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw + {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw + {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw + {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck + }; + + if (ST->hasSSE2()) + if (const auto *Entry = + CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) + return Entry->Cost; + } + // We are going to permute multiple sources and the result will be in multiple // destinations. 
Providing an accurate cost only for splits where the element // type remains the same. @@ -956,25 +1046,26 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, MVT LegalVT = LT.second; if (LegalVT.isVector() && LegalVT.getVectorElementType().getSizeInBits() == - Tp->getVectorElementType()->getPrimitiveSizeInBits() && - LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + BaseTp->getElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < + cast<FixedVectorType>(BaseTp)->getNumElements()) { - unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned VecTySize = DL.getTypeStoreSize(BaseTp); unsigned LegalVTSize = LegalVT.getStoreSize(); // Number of source vectors after legalization: unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; // Number of destination vectors after legalization: unsigned NumOfDests = LT.first; - Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), - LegalVT.getVectorNumElements()); + auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), + LegalVT.getVectorNumElements()); unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); } - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp); } // For 2-input shuffles, we must account for splitting the 2 inputs into many. @@ -992,9 +1083,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb - {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b - {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b - {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b + {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b + {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b + {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b }; if (ST->hasVBMI()) @@ -1006,22 +1097,18 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb - {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw - {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw + {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 - {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw - {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw - {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 - {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc - {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w - {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc + {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 - {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc }; if (ST->hasBWI()) @@ -1034,6 +1121,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps 
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd + {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps @@ -1065,7 +1154,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q - {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d + {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d + + // FIXME: This just applies the type legalization cost rules above + // assuming these completely split. + {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, + {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, + {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, + {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, }; if (ST->hasAVX512()) @@ -1267,14 +1363,22 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp); } int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // TODO: Allow non-throughput costs that aren't binary. + auto AdjustCost = [&CostKind](int Cost) { + if (CostKind != TTI::TCK_RecipThroughput) + return Cost == 0 ? 0 : 1; + return Cost; + }; + // FIXME: Need a better design of the cost table to handle non-simple types of // potential massive combinations (elem_num x src_type x dst_type). @@ -1283,6 +1387,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, @@ -1290,42 +1399,45 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, - // Mask zero extend is a load + broadcast. + // Mask zero extend is a sext + shift. 
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, + + { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, + { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, }; static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { - { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, - { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, - { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, - { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, - { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, - { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, }; @@ -1337,14 +1449,70 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, - { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, - { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm 
vpslld+vptestmd + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb + + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 + { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, + + // Sign extend is zmm vpternlogd+vptruncdb. + // Zero extend is zmm broadcast load+vptruncdw. + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, + + // Sign extend is zmm vpternlogd+vptruncdw. + // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, + + { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd + { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq + + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq - // v16i1 -> v16i32 - load + broadcast - { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, - { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, @@ -1356,6 +1524,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right + { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be 
right + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, @@ -1367,44 +1538,163 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, + + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 }, + { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, + { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 }, + { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 }, + + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, + { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, + { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, + }; + + static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { + // Mask sign extend has an instruction. + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, + + // Mask zero extend is a sext + shift. 
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, + { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, + + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw + { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb + }; + + static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { + { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + }; + + static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd + + // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb + // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb + { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, + { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, + { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, + { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, + { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, + + // sign extend is 
vpcmpeq+maskedmove+vpmovdw + // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw + { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, + { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, + + { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd + { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, - { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, - { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, - { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, - { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, - { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, - { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, - { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, - { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, + + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 }, + { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, - { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, - { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, - { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, - { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, - { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, - { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, }; static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { @@ -1416,6 +1706,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, 
MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 }, @@ -1424,13 +1716,16 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, + + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, - { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, - { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, @@ -1447,6 +1742,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 }, @@ -1456,15 +1753,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 }, + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, + { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 }, - { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 }, - { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, @@ -1503,8 +1806,15 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 }, - { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, - { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 }, + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 }, + { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 }, + { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 }, + + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 }, + { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 }, // This node is expanded into scalarized operations but BasicTTI is overly // optimistic estimating its cost. It computes 3 per element (one // vector-extract, one scalar conversion and one vector-insert). 
The @@ -1544,7 +1854,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, - { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + // These truncates end up widening elements. + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD + + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 }, { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, @@ -1555,6 +1871,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, + + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 }, + + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, }; static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { @@ -1580,16 +1903,26 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 }, + { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 }, { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 }, - { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 }, + { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 }, { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 }, { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 }, + { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 }, + { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 }, + { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, + { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, @@ -1616,11 +1949,19 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, + // These truncates are really widening elements. 
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ + { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD + { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD + { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB - { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, - { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, - { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB + { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, @@ -1639,7 +1980,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (ST->hasSSE2() && !ST->hasAVX()) { if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, LTDest.second, LTSrc.second)) - return LTSrc.first * Entry->Cost; + return AdjustCost(LTSrc.first * Entry->Cost); } EVT SrcTy = TLI->getValueType(DL, Src); @@ -1647,61 +1988,77 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, // The function getSimpleVT only handles simple value types. if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind)); MVT SimpleSrcTy = SrcTy.getSimpleVT(); MVT SimpleDstTy = DstTy.getSimpleVT(); - // Make sure that neither type is going to be split before using the - // AVX512 tables. 
This handles -mprefer-vector-width=256 - // with -min-legal-vector-width<=256 - if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector && - TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) { + if (ST->useAVX512Regs()) { if (ST->hasBWI()) if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); if (ST->hasDQI()) if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); if (ST->hasAVX512()) if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); } + if (ST->hasBWI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + if (ST->hasAVX2()) { if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); } if (ST->hasAVX()) { if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); } if (ST->hasSSE41()) { if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); } if (ST->hasSSE2()) { if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) - return Entry->Cost; + return AdjustCost(Entry->Cost); } - return BaseT::getCastInstrCost(Opcode, Dst, Src, I); + return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I)); } int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + TTI::TargetCostKind CostKind, const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); + // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1774,6 +2131,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, { ISD::SELECT, MVT::v16i32, 1 }, { ISD::SELECT, MVT::v8f64, 1 }, { ISD::SELECT, MVT::v16f32, 1 }, + + { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4 + { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4 + + { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3 + { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3 }; static const CostTblEntry AVX2CostTbl[] = { @@ -1878,14 +2241,14 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * (ExtraCost + Entry->Cost); - return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I); + return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I); } unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } -int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed) { +int X86TTIImpl::getTypeBasedIntrinsicInstrCost( + const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { + // Costs should match the codegen from: // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll @@ -1935,12 +2298,20 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, static const CostTblEntry AVX512CostTbl[] = { { ISD::BITREVERSE, MVT::v8i64, 36 }, { ISD::BITREVERSE, MVT::v16i32, 24 }, + { ISD::BITREVERSE, MVT::v32i16, 10 }, + { ISD::BITREVERSE, MVT::v64i8, 10 }, { ISD::CTLZ, MVT::v8i64, 29 }, { ISD::CTLZ, MVT::v16i32, 35 }, + { ISD::CTLZ, MVT::v32i16, 28 }, + { ISD::CTLZ, MVT::v64i8, 18 }, { ISD::CTPOP, MVT::v8i64, 16 }, { ISD::CTPOP, MVT::v16i32, 24 }, + { ISD::CTPOP, MVT::v32i16, 18 }, + { ISD::CTPOP, MVT::v64i8, 12 }, { ISD::CTTZ, MVT::v8i64, 20 }, { ISD::CTTZ, MVT::v16i32, 28 }, + { ISD::CTTZ, MVT::v32i16, 24 }, + { ISD::CTTZ, MVT::v64i8, 18 }, { ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd { ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq { ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq @@ -1949,6 +2320,22 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq { ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq { ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq + { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split + { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split + { ISD::FMAXNUM, MVT::f32, 2 }, + { ISD::FMAXNUM, MVT::v4f32, 2 }, + { ISD::FMAXNUM, MVT::v8f32, 2 }, + { ISD::FMAXNUM, MVT::v16f32, 2 }, + { ISD::FMAXNUM, MVT::f64, 2 }, + { ISD::FMAXNUM, MVT::v2f64, 2 }, + { ISD::FMAXNUM, MVT::v4f64, 2 }, + { ISD::FMAXNUM, MVT::v8f64, 2 }, }; static const CostTblEntry XOPCostTbl[] = { { ISD::BITREVERSE, MVT::v4i64, 4 }, @@ -2031,6 +2418,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert { ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 
128-bit Op + extract/insert { ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert + { ISD::FMAXNUM, MVT::f32, 3 }, + { ISD::FMAXNUM, MVT::v4f32, 3 }, + { ISD::FMAXNUM, MVT::v8f32, 5 }, + { ISD::FMAXNUM, MVT::f64, 3 }, + { ISD::FMAXNUM, MVT::v2f64, 3 }, + { ISD::FMAXNUM, MVT::v4f64, 5 }, { ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/ { ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/ @@ -2105,13 +2498,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::UADDSAT, MVT::v16i8, 1 }, { ISD::USUBSAT, MVT::v8i16, 1 }, { ISD::USUBSAT, MVT::v16i8, 1 }, + { ISD::FMAXNUM, MVT::f64, 4 }, + { ISD::FMAXNUM, MVT::v2f64, 4 }, { ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/ { ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/ }; static const CostTblEntry SSE1CostTbl[] = { + { ISD::FMAXNUM, MVT::f32, 4 }, + { ISD::FMAXNUM, MVT::v4f32, 4 }, { ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/ { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/ }; + static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets + { ISD::CTTZ, MVT::i64, 1 }, + }; + static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets + { ISD::CTTZ, MVT::i32, 1 }, + { ISD::CTTZ, MVT::i16, 1 }, + { ISD::CTTZ, MVT::i8, 1 }, + }; static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets { ISD::CTLZ, MVT::i64, 1 }, }; @@ -2131,6 +2536,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::BITREVERSE, MVT::i64, 14 }, { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH { ISD::CTPOP, MVT::i64, 10 }, { ISD::SADDO, MVT::i64, 1 }, { ISD::UADDO, MVT::i64, 1 }, @@ -2142,6 +2548,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH { ISD::CTPOP, MVT::i32, 8 }, { ISD::CTPOP, MVT::i16, 9 }, { ISD::CTPOP, MVT::i8, 7 }, @@ -2153,7 +2562,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::UADDO, MVT::i8, 1 }, }; + Type *RetTy = ICA.getReturnType(); Type *OpTy = RetTy; + Intrinsic::ID IID = ICA.getID(); unsigned ISD = ISD::DELETED_NODE; switch (IID) { default: @@ -2173,6 +2584,11 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, case Intrinsic::cttz: ISD = ISD::CTTZ; break; + case Intrinsic::maxnum: + case Intrinsic::minnum: + // FMINNUM has same costs so don't duplicate. 
+ ISD = ISD::FMAXNUM; + break; case Intrinsic::sadd_sat: ISD = ISD::SADDSAT; break; @@ -2256,6 +2672,15 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) return LT.first * Entry->Cost; + if (ST->hasBMI()) { + if (ST->is64Bit()) + if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + } + if (ST->hasLZCNT()) { if (ST->is64Bit()) if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) @@ -2284,12 +2709,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, return LT.first * Entry->Cost; } - return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed); + return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF) { +int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { + if (CostKind != TTI::TCK_RecipThroughput) + return BaseT::getIntrinsicInstrCost(ICA, CostKind); + + if (ICA.isTypeBasedOnly()) + return getTypeBasedIntrinsicInstrCost(ICA, CostKind); + static const CostTblEntry AVX512CostTbl[] = { { ISD::ROTL, MVT::v8i64, 1 }, { ISD::ROTL, MVT::v4i64, 1 }, @@ -2340,6 +2770,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, { ISD::FSHL, MVT::i8, 4 } }; + Intrinsic::ID IID = ICA.getID(); + Type *RetTy = ICA.getReturnType(); + const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); unsigned ISD = ISD::DELETED_NODE; switch (IID) { default: @@ -2379,7 +2812,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, return LT.first * Entry->Cost; } - return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF); + return BaseT::getIntrinsicInstrCost(ICA, CostKind); } int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { @@ -2391,10 +2824,11 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { }; assert(Val->isVectorTy() && "This must be a vector type"); - Type *ScalarType = Val->getScalarType(); + int RegisterFileMoveCost = 0; - if (Index != -1U) { + if (Index != -1U && (Opcode == Instruction::ExtractElement || + Opcode == Instruction::InsertElement)) { // Legalize the type. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); @@ -2403,17 +2837,32 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return 0; // The type may be split. Normalize the index to the new type. - unsigned Width = LT.second.getVectorNumElements(); - Index = Index % Width; + unsigned NumElts = LT.second.getVectorNumElements(); + unsigned SubNumElts = NumElts; + Index = Index % NumElts; + + // For >128-bit vectors, we need to extract higher 128-bit subvectors. + // For inserts, we also need to insert the subvector back. + if (LT.second.getSizeInBits() > 128) { + assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector"); + unsigned NumSubVecs = LT.second.getSizeInBits() / 128; + SubNumElts = NumElts / NumSubVecs; + if (SubNumElts <= Index) { + RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); + Index %= SubNumElts; + } + } if (Index == 0) { // Floating point scalars are already located in index #0. + // Many insertions to #0 can fold away for scalar fp-ops, so let's assume + // true for all. 
if (ScalarType->isFloatingPointTy()) - return 0; + return RegisterFileMoveCost; - // Assume movd/movq XMM <-> GPR is relatively cheap on all targets. - if (ScalarType->isIntegerTy()) - return 1; + // Assume movd/movq XMM -> GPR is relatively cheap on all targets. + if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) + return 1 + RegisterFileMoveCost; } int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -2421,24 +2870,124 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { MVT MScalarTy = LT.second.getScalarType(); if (ST->isSLM()) if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) - return LT.first * Entry->Cost; + return Entry->Cost + RegisterFileMoveCost; + + // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. + if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || + (MScalarTy.isInteger() && ST->hasSSE41())) + return 1 + RegisterFileMoveCost; + + // Assume insertps is relatively cheap on all targets. + if (MScalarTy == MVT::f32 && ST->hasSSE41() && + Opcode == Instruction::InsertElement) + return 1 + RegisterFileMoveCost; + + // For extractions we just need to shuffle the element to index 0, which + // should be very cheap (assume cost = 1). For insertions we need to shuffle + // the elements to its destination. In both cases we must handle the + // subvector move(s). + // If the vector type is already less than 128-bits then don't reduce it. + // TODO: Under what circumstances should we shuffle using the full width? + int ShuffleCost = 1; + if (Opcode == Instruction::InsertElement) { + auto *SubTy = cast<VectorType>(Val); + EVT VT = TLI->getValueType(DL, Val); + if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) + SubTy = FixedVectorType::get(ScalarType, SubNumElts); + ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy); + } + int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; + return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; } // Add to the base cost if we know that the extracted element of a vector is // destined to be moved to and used in the integer register file. - int RegisterFileMoveCost = 0; if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) - RegisterFileMoveCost = 1; + RegisterFileMoveCost += 1; return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } +unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, + bool Insert, bool Extract) { + unsigned Cost = 0; + + // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much + // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. + if (Insert) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + MVT MScalarTy = LT.second.getScalarType(); + + if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || + (MScalarTy.isInteger() && ST->hasSSE41()) || + (MScalarTy == MVT::f32 && ST->hasSSE41())) { + // For types we can insert directly, insertion into 128-bit sub vectors is + // cheap, followed by a cheap chain of concatenations. + if (LT.second.getSizeInBits() <= 128) { + Cost += + BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); + } else { + unsigned NumSubVecs = LT.second.getSizeInBits() / 128; + Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first; + Cost += DemandedElts.countPopulation(); + + // For vXf32 cases, insertion into the 0'th index in each v4f32 + // 128-bit vector is free. + // NOTE: This assumes legalization widens vXf32 vectors. 
+ if (MScalarTy == MVT::f32) + for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements(); + i < e; i += 4) + if (DemandedElts[i]) + Cost--; + } + } else if (LT.second.isVector()) { + // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded + // integer element as a SCALAR_TO_VECTOR, then we build the vector as a + // series of UNPCK followed by CONCAT_VECTORS - all of these can be + // considered cheap. + if (Ty->isIntOrIntVectorTy()) + Cost += DemandedElts.countPopulation(); + + // Get the smaller of the legalized or original pow2-extended number of + // vector elements, which represents the number of unpacks we'll end up + // performing. + unsigned NumElts = LT.second.getVectorNumElements(); + unsigned Pow2Elts = + PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); + Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; + } + } + + // TODO: Use default extraction for now, but we should investigate extending this + // to handle repeated subvector extraction. + if (Extract) + Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); + + return Cost; +} + int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, const Instruction *I) { + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) { + if (isa_and_nonnull<StoreInst>(I)) { + Value *Ptr = I->getOperand(1); + // Store instruction with index and scale costs 2 Uops. + // Check the preceding GEP to identify non-const indices. + if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { + if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) + return TTI::TCC_Basic * 2; + } + } + return TTI::TCC_Basic; + } + // Handle non-power-of-two vectors such as <3 x float> - if (VectorType *VTy = dyn_cast<VectorType>(Src)) { - unsigned NumElem = VTy->getVectorNumElements(); + if (auto *VTy = dyn_cast<FixedVectorType>(Src)) { + unsigned NumElem = VTy->getNumElements(); // Handle a few common cases: // <3 x float> @@ -2453,14 +3002,21 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume that all other non-power-of-two numbers are scalarized. if (!isPowerOf2_32(NumElem)) { + APInt DemandedElts = APInt::getAllOnesValue(NumElem); int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, - AddressSpace); - int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + AddressSpace, CostKind); + int SplitCost = getScalarizationOverhead(VTy, DemandedElts, + Opcode == Instruction::Load, Opcode == Instruction::Store); return NumElem * Cost + SplitCost; } } + // Type legalization can't handle structs + if (TLI->getValueType(DL, Src, true) == MVT::Other) + return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, + CostKind); + // Legalize the type. 
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && @@ -2478,33 +3034,36 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, - unsigned Alignment, - unsigned AddressSpace) { + Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind) { bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); - VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); + auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask - return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace); + return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); - unsigned NumElem = SrcVTy->getVectorNumElements(); - VectorType *MaskTy = - VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); - if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) || - (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) || + unsigned NumElem = SrcVTy->getNumElements(); + auto *MaskTy = + FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); + if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || + (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) || !isPowerOf2_32(NumElem)) { // Scalarization - int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + APInt DemandedElts = APInt::getAllOnesValue(NumElem); + int MaskSplitCost = + getScalarizationOverhead(MaskTy, DemandedElts, false, true); int ScalarCompareCost = getCmpSelInstrCost( - Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr); - int BranchCost = getCFInstrCost(Instruction::Br); + Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, + CostKind); + int BranchCost = getCFInstrCost(Instruction::Br, CostKind); int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - - int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore); + int ValueSplitCost = + getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - MaybeAlign(Alignment), AddressSpace); + Alignment, AddressSpace, CostKind); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } @@ -2519,8 +3078,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { - VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), - LT.second.getVectorNumElements()); + auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), + LT.second.getVectorNumElements()); // Expanding requires fill mask with zeroes Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); } @@ -2558,41 +3117,16 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, return BaseT::getAddressComputationCost(Ty, SE, Ptr); } -int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) { +int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + bool IsPairwise, + TTI::TargetCostKind CostKind) { + // Just use the default implementation for pair reductions. 
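// --- Illustrative aside (not part of the patch) ---------------------------
// Back-of-the-envelope form of the scalarized masked load/store cost that is
// assembled above: every element pays a branch plus a compare on its mask
// bit and a scalar memory access, and the mask and data vectors each pay a
// scalarization (split) cost. The per-piece costs below are placeholders;
// only the way they are combined follows the patch.
#include <cstdio>

static unsigned scalarizedMaskedOpCost(unsigned NumElem,
                                       unsigned ScalarMemCost,
                                       unsigned ScalarCmpCost,
                                       unsigned BranchCost,
                                       unsigned MaskSplitCost,
                                       unsigned ValueSplitCost) {
  unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCmpCost);
  unsigned MemopCost = NumElem * ScalarMemCost;
  return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}

int main() {
  // e.g. a <3 x float> masked load that has to be scalarized.
  std::printf("cost=%u\n", scalarizedMaskedOpCost(3, 1, 1, 1, 3, 3));
  return 0;
}
// ---------------------------------------------------------------------------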
+ if (IsPairwise) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind); + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry SLMCostTblPairWise[] = { - { ISD::FADD, MVT::v2f64, 3 }, - { ISD::ADD, MVT::v2i64, 5 }, - }; - - static const CostTblEntry SSE2CostTblPairWise[] = { - { ISD::FADD, MVT::v2f64, 2 }, - { ISD::FADD, MVT::v4f32, 4 }, - { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". - { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32. - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". - { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16 - { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16 - { ISD::ADD, MVT::v8i16, 5 }, - { ISD::ADD, MVT::v2i8, 2 }, - { ISD::ADD, MVT::v4i8, 2 }, - { ISD::ADD, MVT::v8i8, 2 }, - { ISD::ADD, MVT::v16i8, 3 }, - }; - - static const CostTblEntry AVX1CostTblPairWise[] = { - { ISD::FADD, MVT::v4f64, 5 }, - { ISD::FADD, MVT::v8f32, 7 }, - { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". - { ISD::ADD, MVT::v8i32, 5 }, - { ISD::ADD, MVT::v16i16, 6 }, - { ISD::ADD, MVT::v32i8, 4 }, - }; - static const CostTblEntry SLMCostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 3 }, { ISD::ADD, MVT::v2i64, 5 }, @@ -2633,66 +3167,49 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, EVT VT = TLI->getValueType(DL, ValTy); if (VT.isSimple()) { MVT MTy = VT.getSimpleVT(); - if (IsPairwise) { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) - return Entry->Cost; - - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return Entry->Cost; - - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return Entry->Cost; - } else { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) - return Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return Entry->Cost; - } + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; } std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; - if (IsPairwise) { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + auto *ValVTy = cast<FixedVectorType>(ValTy); - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + unsigned ArithmeticCost = 0; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValVTy->getNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. 
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), + MTy.getVectorNumElements()); + ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); + ArithmeticCost *= LT.first - 1; + } - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - } else { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - } + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; // FIXME: These assume a naive kshift+binop lowering, which is probably // conservative in most cases. - // FIXME: This doesn't cost large types like v128i1 correctly. static const CostTblEntry AVX512BoolReduction[] = { { ISD::AND, MVT::v2i1, 3 }, { ISD::AND, MVT::v4i1, 5 }, @@ -2738,252 +3255,408 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, }; // Handle bool allof/anyof patterns. - if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) { + if (ValVTy->getElementType()->isIntegerTy(1)) { + unsigned ArithmeticCost = 0; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValVTy->getNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. + auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), + MTy.getVectorNumElements()); + ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); + ArithmeticCost *= LT.first - 1; + } + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; + + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, + CostKind); + } + + unsigned NumVecElts = ValVTy->getNumElements(); + unsigned ScalarSize = ValVTy->getScalarSizeInBits(); + + // Special case power of 2 reductions where the scalar type isn't changed + // by type legalization. + if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, + CostKind); + + unsigned ReductionCost = 0; + + auto *Ty = ValVTy; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValVTy->getNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. 
+ Ty = FixedVectorType::get(ValVTy->getElementType(), + MTy.getVectorNumElements()); + ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); + ReductionCost *= LT.first - 1; + NumVecElts = MTy.getVectorNumElements(); + } + + // Now handle reduction with the legal type, taking into account size changes + // at each level. + while (NumVecElts > 1) { + // Determine the size of the remaining vector we need to reduce. + unsigned Size = NumVecElts * ScalarSize; + NumVecElts /= 2; + // If we're reducing from 256/512 bits, use an extract_subvector. + if (Size > 128) { + auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); + ReductionCost += + getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + Ty = SubTy; + } else if (Size == 128) { + // Reducing from 128 bits is a permute of v2f64/v2i64. + FixedVectorType *ShufTy; + if (ValVTy->isFloatingPointTy()) + ShufTy = + FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); + else + ShufTy = + FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); + ReductionCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else if (Size == 64) { + // Reducing from 64 bits is a shuffle of v4f32/v4i32. + FixedVectorType *ShufTy; + if (ValVTy->isFloatingPointTy()) + ShufTy = + FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); + else + ShufTy = + FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); + ReductionCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else { + // Reducing from smaller size is a shift by immediate. + auto *ShiftTy = FixedVectorType::get( + Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); + ReductionCost += getArithmeticInstrCost( + Instruction::LShr, ShiftTy, CostKind, + TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + } + + // Add the arithmetic op for this level. + ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); } - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + // Add the final extract element to the cost. + return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } -int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, - bool IsPairwise, bool IsUnsigned) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); +int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); MVT MTy = LT.second; int ISD; - if (ValTy->isIntOrIntVectorTy()) { + if (Ty->isIntOrIntVectorTy()) { ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; } else { - assert(ValTy->isFPOrFPVectorTy() && + assert(Ty->isFPOrFPVectorTy() && "Expected float point or integer vector type."); ISD = ISD::FMINNUM; } - // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput - // and make it as the cost. 
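// --- Illustrative aside (not part of the patch) ---------------------------
// The shape of the new bottom-up reduction costing, as a standalone toy:
// halve the vector each level, pay one data-movement op (subvector extract,
// permute, or shift, depending on the remaining width) plus one arithmetic
// op per level, and one final extractelement. The unit costs are invented;
// the loop structure is what mirrors the hunk above.
#include <cstdio>

static unsigned treeReductionCost(unsigned NumElts, unsigned ScalarBits,
                                  unsigned MoveCost, unsigned OpCost,
                                  unsigned ExtractCost) {
  unsigned Cost = 0;
  while (NumElts > 1) {
    unsigned RemainingBits = NumElts * ScalarBits;
    NumElts /= 2;
    // RemainingBits > 128  -> extract_subvector
    // RemainingBits == 128 -> v2i64/v2f64 permute
    // RemainingBits == 64  -> v4i32/v4f32 shuffle
    // otherwise            -> shift by immediate
    (void)RemainingBits;
    Cost += MoveCost + OpCost;
  }
  return Cost + ExtractCost;
}

int main() {
  // v8i32 add reduction: three halving levels, then one extract.
  std::printf("v8i32 reduce ~ %u\n", treeReductionCost(8, 32, 1, 1, 1));
  return 0;
}
// ---------------------------------------------------------------------------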
+ static const CostTblEntry SSE1CostTbl[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, + }; - static const CostTblEntry SSE1CostTblPairWise[] = { - {ISD::FMINNUM, MVT::v4f32, 4}, - }; - - static const CostTblEntry SSE2CostTblPairWise[] = { - {ISD::FMINNUM, MVT::v2f64, 3}, - {ISD::SMIN, MVT::v2i64, 6}, - {ISD::UMIN, MVT::v2i64, 8}, - {ISD::SMIN, MVT::v4i32, 6}, - {ISD::UMIN, MVT::v4i32, 8}, - {ISD::SMIN, MVT::v8i16, 4}, - {ISD::UMIN, MVT::v8i16, 6}, - {ISD::SMIN, MVT::v16i8, 8}, - {ISD::UMIN, MVT::v16i8, 6}, - }; - - static const CostTblEntry SSE41CostTblPairWise[] = { - {ISD::FMINNUM, MVT::v4f32, 2}, - {ISD::SMIN, MVT::v2i64, 9}, - {ISD::UMIN, MVT::v2i64,10}, - {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" - {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" - {ISD::SMIN, MVT::v8i16, 2}, - {ISD::UMIN, MVT::v8i16, 2}, - {ISD::SMIN, MVT::v16i8, 3}, - {ISD::UMIN, MVT::v16i8, 3}, - }; - - static const CostTblEntry SSE42CostTblPairWise[] = { - {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" - {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" - }; - - static const CostTblEntry AVX1CostTblPairWise[] = { - {ISD::FMINNUM, MVT::v4f32, 1}, - {ISD::FMINNUM, MVT::v4f64, 1}, - {ISD::FMINNUM, MVT::v8f32, 2}, - {ISD::SMIN, MVT::v2i64, 3}, - {ISD::UMIN, MVT::v2i64, 3}, - {ISD::SMIN, MVT::v4i32, 1}, - {ISD::UMIN, MVT::v4i32, 1}, - {ISD::SMIN, MVT::v8i16, 1}, - {ISD::UMIN, MVT::v8i16, 1}, - {ISD::SMIN, MVT::v16i8, 2}, - {ISD::UMIN, MVT::v16i8, 2}, - {ISD::SMIN, MVT::v4i64, 7}, - {ISD::UMIN, MVT::v4i64, 7}, - {ISD::SMIN, MVT::v8i32, 3}, - {ISD::UMIN, MVT::v8i32, 3}, - {ISD::SMIN, MVT::v16i16, 3}, - {ISD::UMIN, MVT::v16i16, 3}, - {ISD::SMIN, MVT::v32i8, 3}, - {ISD::UMIN, MVT::v32i8, 3}, - }; - - static const CostTblEntry AVX2CostTblPairWise[] = { - {ISD::SMIN, MVT::v4i64, 2}, - {ISD::UMIN, MVT::v4i64, 2}, - {ISD::SMIN, MVT::v8i32, 1}, - {ISD::UMIN, MVT::v8i32, 1}, - {ISD::SMIN, MVT::v16i16, 1}, - {ISD::UMIN, MVT::v16i16, 1}, - {ISD::SMIN, MVT::v32i8, 2}, - {ISD::UMIN, MVT::v32i8, 2}, - }; - - static const CostTblEntry AVX512CostTblPairWise[] = { - {ISD::FMINNUM, MVT::v8f64, 1}, - {ISD::FMINNUM, MVT::v16f32, 2}, - {ISD::SMIN, MVT::v8i64, 2}, - {ISD::UMIN, MVT::v8i64, 2}, - {ISD::SMIN, MVT::v16i32, 1}, - {ISD::UMIN, MVT::v16i32, 1}, - }; - - static const CostTblEntry SSE1CostTblNoPairWise[] = { - {ISD::FMINNUM, MVT::v4f32, 4}, + static const CostTblEntry SSE2CostTbl[] = { + {ISD::FMINNUM, MVT::v2f64, 1}, + {ISD::SMIN, MVT::v8i16, 1}, + {ISD::UMIN, MVT::v16i8, 1}, }; - static const CostTblEntry SSE2CostTblNoPairWise[] = { - {ISD::FMINNUM, MVT::v2f64, 3}, - {ISD::SMIN, MVT::v2i64, 6}, - {ISD::UMIN, MVT::v2i64, 8}, - {ISD::SMIN, MVT::v4i32, 6}, - {ISD::UMIN, MVT::v4i32, 8}, - {ISD::SMIN, MVT::v8i16, 4}, - {ISD::UMIN, MVT::v8i16, 6}, - {ISD::SMIN, MVT::v16i8, 8}, - {ISD::UMIN, MVT::v16i8, 6}, + static const CostTblEntry SSE41CostTbl[] = { + {ISD::SMIN, MVT::v4i32, 1}, + {ISD::UMIN, MVT::v4i32, 1}, + {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v16i8, 1}, }; - static const CostTblEntry SSE41CostTblNoPairWise[] = { - {ISD::FMINNUM, MVT::v4f32, 3}, - {ISD::SMIN, MVT::v2i64, 9}, - {ISD::UMIN, MVT::v2i64,11}, - {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" - {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" - {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" - {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" - {ISD::SMIN, MVT::v16i8, 3}, - {ISD::UMIN, 
MVT::v16i8, 3}, + static const CostTblEntry SSE42CostTbl[] = { + {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd }; - static const CostTblEntry SSE42CostTblNoPairWise[] = { - {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" - {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" + static const CostTblEntry AVX1CostTbl[] = { + {ISD::FMINNUM, MVT::v8f32, 1}, + {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::SMIN, MVT::v8i32, 3}, + {ISD::UMIN, MVT::v8i32, 3}, + {ISD::SMIN, MVT::v16i16, 3}, + {ISD::UMIN, MVT::v16i16, 3}, + {ISD::SMIN, MVT::v32i8, 3}, + {ISD::UMIN, MVT::v32i8, 3}, }; - static const CostTblEntry AVX1CostTblNoPairWise[] = { - {ISD::FMINNUM, MVT::v4f32, 1}, - {ISD::FMINNUM, MVT::v4f64, 1}, - {ISD::FMINNUM, MVT::v8f32, 1}, - {ISD::SMIN, MVT::v2i64, 3}, - {ISD::UMIN, MVT::v2i64, 3}, - {ISD::SMIN, MVT::v4i32, 1}, - {ISD::UMIN, MVT::v4i32, 1}, - {ISD::SMIN, MVT::v8i16, 1}, - {ISD::UMIN, MVT::v8i16, 1}, - {ISD::SMIN, MVT::v16i8, 2}, - {ISD::UMIN, MVT::v16i8, 2}, - {ISD::SMIN, MVT::v4i64, 7}, - {ISD::UMIN, MVT::v4i64, 7}, - {ISD::SMIN, MVT::v8i32, 2}, - {ISD::UMIN, MVT::v8i32, 2}, - {ISD::SMIN, MVT::v16i16, 2}, - {ISD::UMIN, MVT::v16i16, 2}, - {ISD::SMIN, MVT::v32i8, 2}, - {ISD::UMIN, MVT::v32i8, 2}, - }; - - static const CostTblEntry AVX2CostTblNoPairWise[] = { - {ISD::SMIN, MVT::v4i64, 1}, - {ISD::UMIN, MVT::v4i64, 1}, - {ISD::SMIN, MVT::v8i32, 1}, - {ISD::UMIN, MVT::v8i32, 1}, - {ISD::SMIN, MVT::v16i16, 1}, - {ISD::UMIN, MVT::v16i16, 1}, - {ISD::SMIN, MVT::v32i8, 1}, - {ISD::UMIN, MVT::v32i8, 1}, - }; - - static const CostTblEntry AVX512CostTblNoPairWise[] = { - {ISD::FMINNUM, MVT::v8f64, 1}, - {ISD::FMINNUM, MVT::v16f32, 2}, - {ISD::SMIN, MVT::v8i64, 1}, - {ISD::UMIN, MVT::v8i64, 1}, - {ISD::SMIN, MVT::v16i32, 1}, - {ISD::UMIN, MVT::v16i32, 1}, - }; - - if (IsPairwise) { - if (ST->hasAVX512()) - if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX2CostTbl[] = { + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::UMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::UMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 1}, + {ISD::UMIN, MVT::v32i8, 1}, + }; - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512CostTbl[] = { + {ISD::FMINNUM, MVT::v16f32, 1}, + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::SMIN, MVT::v2i64, 1}, + {ISD::UMIN, MVT::v2i64, 1}, + {ISD::SMIN, MVT::v4i64, 1}, + {ISD::UMIN, MVT::v4i64, 1}, + {ISD::SMIN, MVT::v8i64, 1}, + {ISD::UMIN, MVT::v8i64, 1}, + {ISD::SMIN, MVT::v16i32, 1}, + {ISD::UMIN, MVT::v16i32, 1}, + }; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + static const CostTblEntry AVX512BWCostTbl[] = { + {ISD::SMIN, MVT::v32i16, 1}, + {ISD::UMIN, MVT::v32i16, 1}, + {ISD::SMIN, MVT::v64i8, 1}, + {ISD::UMIN, MVT::v64i8, 1}, + }; - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + // If we have a native MIN/MAX instruction for this type, use it. 
+ if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE41()) - if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE1()) + if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; + + unsigned CmpOpcode; + if (Ty->isFPOrFPVectorTy()) { + CmpOpcode = Instruction::FCmp; } else { - if (ST->hasAVX512()) - if (const auto *Entry = - CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + assert(Ty->isIntOrIntVectorTy() && + "expecting floating point or integer type for min/max reduction"); + CmpOpcode = Instruction::ICmp; + } - if (ST->hasAVX2()) - if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + // Otherwise fall back to cmp+select. + return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) + + getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind); +} + +int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind) { + // Just use the default implementation for pair reductions. + if (IsPairwise) + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, + CostKind); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD; + if (ValTy->isIntOrIntVectorTy()) { + ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; + } else { + assert(ValTy->isFPOrFPVectorTy() && + "Expected float point or integer vector type."); + ISD = ISD::FMINNUM; + } + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. 
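// --- Illustrative aside (not part of the patch) ---------------------------
// When none of the ISA-specific tables above matches, the min/max cost falls
// back to the cost of the expansion "min(a,b) = select(cmp(a,b), a, b)".
// A scalar analogue of that expansion with the corresponding two-part cost;
// the constants are placeholders, not measured values.
#include <cstdio>

static int sminExpanded(int A, int B) {
  bool Pred = A < B;   // the compare the cost model charges for
  return Pred ? A : B; // the select it charges for
}

static unsigned fallbackMinMaxCost(unsigned CmpCost, unsigned SelectCost) {
  return CmpCost + SelectCost;
}

int main() {
  std::printf("smin(3,7)=%d, fallback cost=%u\n", sminExpanded(3, 7),
              fallbackMinMaxCost(1, 1));
  return 0;
}
// ---------------------------------------------------------------------------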
+ + static const CostTblEntry SSE2CostTblNoPairWise[] = { + {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw + {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw + {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw + }; + + static const CostTblEntry SSE41CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 + {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 + {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 + {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 + {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor + {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax + {ISD::SMIN, MVT::v2i8, 3}, // pminsb + {ISD::SMIN, MVT::v4i8, 5}, // pminsb + {ISD::SMIN, MVT::v8i8, 7}, // pminsb + {ISD::SMIN, MVT::v16i8, 6}, + {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 + {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 + {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 + {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v16i16, 6}, + {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax + {ISD::SMIN, MVT::v32i8, 8}, + {ISD::UMIN, MVT::v32i8, 8}, + }; + + static const CostTblEntry AVX512BWCostTblNoPairWise[] = { + {ISD::SMIN, MVT::v32i16, 8}, + {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax + {ISD::SMIN, MVT::v64i8, 10}, + {ISD::UMIN, MVT::v64i8, 10}, + }; + + // Before legalizing the type, give a chance to look up illegal narrow types + // in the table. + // FIXME: Is there a better way to do this? + EVT VT = TLI->getValueType(DL, ValTy); + if (VT.isSimple()) { + MVT MTy = VT.getSimpleVT(); + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) + return Entry->Cost; if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - - if (ST->hasSSE42()) - if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + return Entry->Cost; if (ST->hasSSE41()) if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + return Entry->Cost; if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + return Entry->Cost; + } - if (ST->hasSSE1()) - if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + auto *ValVTy = cast<FixedVectorType>(ValTy); + unsigned NumVecElts = ValVTy->getNumElements(); + + auto *Ty = ValVTy; + unsigned MinMaxCost = 0; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValVTy->getNumElements()) { + // Type needs to be split. We need LT.first - 1 operations ops. 
+ Ty = FixedVectorType::get(ValVTy->getElementType(), + MTy.getVectorNumElements()); + auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), + MTy.getVectorNumElements()); + MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); + MinMaxCost *= LT.first - 1; + NumVecElts = MTy.getVectorNumElements(); } - return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) + return MinMaxCost + Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return MinMaxCost + Entry->Cost; + + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) + return MinMaxCost + Entry->Cost; + + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return MinMaxCost + Entry->Cost; + + unsigned ScalarSize = ValTy->getScalarSizeInBits(); + + // Special case power of 2 reductions where the scalar type isn't changed + // by type legalization. + if (!isPowerOf2_32(ValVTy->getNumElements()) || + ScalarSize != MTy.getScalarSizeInBits()) + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned, + CostKind); + + // Now handle reduction with the legal type, taking into account size changes + // at each level. + while (NumVecElts > 1) { + // Determine the size of the remaining vector we need to reduce. + unsigned Size = NumVecElts * ScalarSize; + NumVecElts /= 2; + // If we're reducing from 256/512 bits, use an extract_subvector. + if (Size > 128) { + auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); + MinMaxCost += + getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + Ty = SubTy; + } else if (Size == 128) { + // Reducing from 128 bits is a permute of v2f64/v2i64. + VectorType *ShufTy; + if (ValTy->isFloatingPointTy()) + ShufTy = + FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); + else + ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); + MinMaxCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else if (Size == 64) { + // Reducing from 64 bits is a shuffle of v4f32/v4i32. + FixedVectorType *ShufTy; + if (ValTy->isFloatingPointTy()) + ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); + else + ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); + MinMaxCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else { + // Reducing from smaller size is a shift by immediate. + auto *ShiftTy = FixedVectorType::get( + Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); + MinMaxCost += getArithmeticInstrCost( + Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, + TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + } + + // Add the arithmetic op for this level. + auto *SubCondTy = + FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); + MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); + } + + // Add the final extract element to the cost. + return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } /// Calculate the cost of materializing a 64-bit value. 
This helper @@ -2999,7 +3672,8 @@ int X86TTIImpl::getIntImmCost(int64_t Val) { return 2 * TTI::TCC_Basic; } -int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -3034,7 +3708,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { } int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) { + Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -3121,17 +3795,18 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im if (Idx == ImmIdx) { int NumConstants = divideCeil(BitSize, 64); - int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast<int>(TTI::TCC_Free) : Cost; } - return X86TTIImpl::getIntImmCost(Imm, Ty); + return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); } int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -3162,52 +3837,45 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, return TTI::TCC_Free; break; } - return X86TTIImpl::getIntImmCost(Imm, Ty); + return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); } -unsigned X86TTIImpl::getUserCost(const User *U, - ArrayRef<const Value *> Operands) { - if (isa<StoreInst>(U)) { - Value *Ptr = U->getOperand(1); - // Store instruction with index and scale costs 2 Uops. - // Check the preceding GEP to identify non-const indices. - if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) { - if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) - return TTI::TCC_Basic * 2; - } - return TTI::TCC_Basic; - } - return BaseT::getUserCost(U, Operands); +unsigned +X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { + if (CostKind != TTI::TCK_RecipThroughput) + return Opcode == Instruction::PHI ? 0 : 1; + // Branches are assumed to be predicted. + return CostKind == TTI::TCK_RecipThroughput ? 0 : 1; } // Return an average cost of Gather / Scatter instruction, maybe improved later -int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, - unsigned Alignment, unsigned AddressSpace) { +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, + Align Alignment, unsigned AddressSpace) { assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); - unsigned VF = SrcVTy->getVectorNumElements(); + unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); // Try to reduce index size from 64 bit (default for GEP) // to 32. It is essential for VF 16. If the index can't be reduced to 32, the // operation will use 16 x 64 indices which do not fit in a zmm and needs // to split. Also check that the base pointer is the same for all lanes, // and that there's at most one variable index. 
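// --- Illustrative aside (not part of the patch) ---------------------------
// Why the gather/scatter code tries to shrink 64-bit GEP indices to 32 bits
// for VF >= 16: sixteen i64 indices need two 512-bit registers (so the
// gather is split), while sixteen i32 indices fit in one. A standalone
// version of that arithmetic:
#include <cstdio>

static unsigned indexSplitFactor(unsigned VF, unsigned IndexBits,
                                 unsigned MaxVectorBits) {
  unsigned IndexVectorBits = VF * IndexBits;
  return (IndexVectorBits + MaxVectorBits - 1) / MaxVectorBits; // round up
}

int main() {
  std::printf("VF=16, i64 indices: split=%u\n", indexSplitFactor(16, 64, 512)); // 2
  std::printf("VF=16, i32 indices: split=%u\n", indexSplitFactor(16, 32, 512)); // 1
  return 0;
}
// ---------------------------------------------------------------------------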
- auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { unsigned IndexSize = DL.getPointerSizeInBits(); - GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); if (IndexSize < 64 || !GEP) return IndexSize; unsigned NumOfVarIndices = 0; - Value *Ptrs = GEP->getPointerOperand(); + const Value *Ptrs = GEP->getPointerOperand(); if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) return IndexSize; for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { if (isa<Constant>(GEP->getOperand(i))) continue; Type *IndxTy = GEP->getOperand(i)->getType(); - if (IndxTy->isVectorTy()) - IndxTy = IndxTy->getVectorElementType(); + if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) + IndxTy = IndexVTy->getElementType(); if ((IndxTy->getPrimitiveSizeInBits() == 64 && !isa<SExtInst>(GEP->getOperand(i))) || ++NumOfVarIndices > 1) @@ -3216,21 +3884,21 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, return (unsigned)32; }; - // Trying to reduce IndexSize to 32 bits for vector 16. // By default the IndexSize is equal to pointer size. unsigned IndexSize = (ST->hasAVX512() && VF >= 16) ? getIndexSizeInBits(Ptr, DL) : DL.getPointerSizeInBits(); - Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), - IndexSize), VF); + auto *IndexVTy = FixedVectorType::get( + IntegerType::get(SrcVTy->getContext(), IndexSize), VF); std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); int SplitFactor = std::max(IdxsLT.first, SrcLT.first); if (SplitFactor > 1) { // Handle splitting of vector of pointers - Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + auto *SplitSrcTy = + FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, AddressSpace); } @@ -3241,7 +3909,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, ? ST->getGatherOverhead() : ST->getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - MaybeAlign(Alignment), AddressSpace); + MaybeAlign(Alignment), AddressSpace, + TTI::TCK_RecipThroughput); } /// Return the cost of full scalarization of gather / scatter operation. @@ -3253,25 +3922,29 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, /// AddressSpace - pointer[s] address space. 
/// int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, - bool VariableMask, unsigned Alignment, + bool VariableMask, Align Alignment, unsigned AddressSpace) { - unsigned VF = SrcVTy->getVectorNumElements(); + unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); + APInt DemandedElts = APInt::getAllOnesValue(VF); + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; int MaskUnpackCost = 0; if (VariableMask) { - VectorType *MaskTy = - VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); - MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + auto *MaskTy = + FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); + MaskUnpackCost = + getScalarizationOverhead(MaskTy, DemandedElts, false, true); int ScalarCompareCost = getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), - nullptr); - int BranchCost = getCFInstrCost(Instruction::Br); + nullptr, CostKind); + int BranchCost = getCFInstrCost(Instruction::Br, CostKind); MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); } // The cost of the scalar loads/stores. int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - MaybeAlign(Alignment), AddressSpace); + MaybeAlign(Alignment), AddressSpace, + CostKind); int InsertExtractCost = 0; if (Opcode == Instruction::Load) @@ -3290,21 +3963,28 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, /// Calculate the cost of Gather / Scatter operation int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, - Value *Ptr, bool VariableMask, - unsigned Alignment) { + const Value *Ptr, bool VariableMask, + Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr) { + + if (CostKind != TTI::TCK_RecipThroughput) + return 1; + assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); - unsigned VF = SrcVTy->getVectorNumElements(); + unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy && Ptr->getType()->isVectorTy()) - PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); + PtrTy = dyn_cast<PointerType>( + cast<VectorType>(Ptr->getType())->getElementType()); assert(PtrTy && "Unexpected type for Ptr argument"); unsigned AddressSpace = PtrTy->getAddressSpace(); bool Scalarize = false; if ((Opcode == Instruction::Load && - !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) || + !isLegalMaskedGather(SrcVTy, Align(Alignment))) || (Opcode == Instruction::Store && - !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment)))) + !isLegalMaskedScatter(SrcVTy, Align(Alignment)))) Scalarize = true; // Gather / Scatter for vector 2 is not profitable on KNL / SKX // Vector-4 of gather/scatter instruction does not exist on KNL. @@ -3337,12 +4017,13 @@ bool X86TTIImpl::canMacroFuseCmp() { return ST->hasMacroFusion() || ST->hasBranchFusion(); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { if (!ST->hasAVX()) return false; // The backend can't handle a single element vector. 
- if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1) + if (isa<VectorType>(DataTy) && + cast<FixedVectorType>(DataTy)->getNumElements() == 1) return false; Type *ScalarTy = DataTy->getScalarType(); @@ -3360,7 +4041,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) { ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { +bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { return isLegalMaskedLoad(DataType, Alignment); } @@ -3407,10 +4088,10 @@ bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { return false; // The backend can't handle a single element vector. - if (DataTy->getVectorNumElements() == 1) + if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) return false; - Type *ScalarTy = DataTy->getVectorElementType(); + Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) return true; @@ -3427,7 +4108,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { return isLegalMaskedExpandLoad(DataTy); } -bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) { +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { // Some CPUs have better gather performance than others. // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only // enable gather with a -march. @@ -3446,8 +4127,8 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) { // In this case we can reject non-power-of-2 vectors. // We also reject single element vectors as the type legalizer can't // scalarize it. - if (isa<VectorType>(DataTy)) { - unsigned NumElts = DataTy->getVectorNumElements(); + if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) { + unsigned NumElts = DataVTy->getNumElements(); if (NumElts == 1 || !isPowerOf2_32(NumElts)) return false; } @@ -3465,7 +4146,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) { return IntWidth == 32 || IntWidth == 64; } -bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { // AVX2 doesn't support scatter if (!ST->hasAVX512()) return false; @@ -3505,11 +4186,22 @@ bool X86TTIImpl::areFunctionArgsABICompatible( // If we get here, we know the target features match. If one function // considers 512-bit vectors legal and the other does not, consider them // incompatible. - // FIXME Look at the arguments and only consider 512 bit or larger vectors? const TargetMachine &TM = getTLI()->getTargetMachine(); - return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == - TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs(); + if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == + TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) + return true; + + // Consider the arguments compatible if they aren't vectors or aggregates. + // FIXME: Look at the size of vectors. + // FIXME: Look at the element types of aggregates to see if there are vectors. + // FIXME: The API of this function seems intended to allow arguments + // to be removed from the set, but the caller doesn't check if the set + // becomes empty so that may not work in practice. 
+ return llvm::none_of(Args, [](Argument *A) { + auto *EltTy = cast<PointerType>(A->getType())->getElementType(); + return EltTy->isVectorTy() || EltTy->isAggregateType(); + }); } X86TTIImpl::TTI::MemCmpExpansionOptions @@ -3517,6 +4209,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { TTI::MemCmpExpansionOptions Options; Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); Options.NumLoadsPerBlock = 2; + // All GPR and vector loads can be unaligned. + Options.AllowOverlappingLoads = true; if (IsZeroCmp) { // Only enable vector loads for equality comparison. Right now the vector // version is not as fast for three way compare (see #33329). @@ -3524,8 +4218,6 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); - // All GPR and vector loads can be unaligned. - Options.AllowOverlappingLoads = true; } if (ST->is64Bit()) { Options.LoadSizes.push_back(8); @@ -3555,24 +4247,22 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() { // computing the cost using a generic formula as a function of generic // shuffles. We therefore use a lookup table instead, filled according to // the instruction sequences that codegen currently generates. -int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int X86TTIImpl::getInterleavedMemoryOpCostAVX2( + unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). if (Indices.size() && Indices.size() != Factor) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, + CostKind); // VecTy for interleave memop is <VF*Factor x Elt>. // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -3584,10 +4274,11 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, // (see MachineValueType.h::getVectorVT()). if (!LegalVT.isVector()) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, + CostKind); - unsigned VF = VecTy->getVectorNumElements() / Factor; - Type *ScalarTy = VecTy->getVectorElementType(); + unsigned VF = VecTy->getNumElements() / Factor; + Type *ScalarTy = VecTy->getElementType(); // Calculate the number of memory operations (NumOfMemOps), required // for load/store the VecTy. @@ -3596,16 +4287,18 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // Get the cost of one memory operation. 
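// --- Illustrative aside (not part of the patch) ---------------------------
// How the interleave-group costing above sizes its memory operations: the
// whole group is treated as one <VF*Factor x Elt> vector, and the number of
// legal memory ops is that width divided (rounding up) by the legal vector
// width. For the VF=4, Factor=3, i32 example mentioned in the comment, the
// <12 x i32> group becomes three 128-bit operations.
#include <cstdio>

static unsigned numLegalMemOps(unsigned VF, unsigned Factor, unsigned EltBytes,
                               unsigned LegalVecBytes) {
  unsigned GroupBytes = VF * Factor * EltBytes;
  return (GroupBytes + LegalVecBytes - 1) / LegalVecBytes; // round up
}

int main() {
  std::printf("VF=4, Factor=3, i32 -> %u memops\n", numLegalMemOps(4, 3, 4, 16));
  return 0;
}
// ---------------------------------------------------------------------------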
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), - LegalVT.getVectorNumElements()); + auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), + LegalVT.getVectorNumElements()); unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, - MaybeAlign(Alignment), AddressSpace); + MaybeAlign(Alignment), AddressSpace, + CostKind); - VectorType *VT = VectorType::get(ScalarTy, VF); + auto *VT = FixedVectorType::get(ScalarTy, VF); EVT ETy = TLI->getValueType(DL, VT); if (!ETy.isSimple()) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, + CostKind); // TODO: Complete for other data-types and strides. // Each combination of Stride, ElementTy and VF results in a different @@ -3664,24 +4357,21 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, CostKind); } // Get estimation for interleaved load/store operations and strided load. // \p Indices contains indices for strided load. // \p Factor - the factor of interleaving. // AVX-512 provides 3-src shuffles that significantly reduces the cost. -int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int X86TTIImpl::getInterleavedMemoryOpCostAVX512( + unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); // VecTy for interleave memop is <VF*Factor x Elt>. @@ -3696,12 +4386,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; // Get the cost of one memory operation. - Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), - LegalVT.getVectorNumElements()); + auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), + LegalVT.getVectorNumElements()); unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, - MaybeAlign(Alignment), AddressSpace); + MaybeAlign(Alignment), AddressSpace, + CostKind); - unsigned VF = VecTy->getVectorNumElements() / Factor; + unsigned VF = VecTy->getNumElements() / Factor; MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); if (Opcode == Instruction::Load) { @@ -3733,8 +4424,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned NumOfLoadsInInterleaveGrp = Indices.size() ? 
Indices.size() : Factor; - Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), - VecTy->getVectorNumElements() / Factor); + auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), + VecTy->getNumElements() / Factor); unsigned NumOfResults = getTLI()->getTypeLegalizationCost(DL, ResultTy).first * NumOfLoadsInInterleaveGrp; @@ -3796,15 +4487,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, return Cost; } -int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int X86TTIImpl::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, + bool UseMaskForCond, bool UseMaskForGaps) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { - Type *EltTy = VecTy->getVectorElementType(); + Type *EltTy = cast<VectorType>(VecTy)->getElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(32) || EltTy->isPointerTy()) return true; @@ -3813,15 +4501,15 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, return false; }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) - return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, - UseMaskForCond, UseMaskForGaps); + return getInterleavedMemoryOpCostAVX512( + Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment, + AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); if (ST->hasAVX2()) - return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, - UseMaskForCond, UseMaskForGaps); + return getInterleavedMemoryOpCostAVX2( + Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment, + AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h index b9c2dbd78058..d462e1f96ca2 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -16,11 +16,9 @@ #ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H #define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H -#include "X86.h" #include "X86TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/CodeGen/TargetLowering.h" namespace llvm { @@ -107,9 +105,9 @@ public: /// \name Cache TTI Implementation /// @{ llvm::Optional<unsigned> getCacheSize( - TargetTransformInfo::CacheLevel Level) const; + TargetTransformInfo::CacheLevel Level) const override; llvm::Optional<unsigned> getCacheAssociativity( - TargetTransformInfo::CacheLevel Level) const; + TargetTransformInfo::CacheLevel Level) const override; /// @} /// \name Vector TTI Implementations @@ -121,76 +119,90 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, 
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef<const Value *> Args = ArrayRef<const Value *>(), const Instruction *CxtI = nullptr); - int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, + VectorType *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, + TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, + bool Insert, bool Extract); int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, - unsigned AddressSpace, const Instruction *I = nullptr); - int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, - bool VariableMask, unsigned Alignment); + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + int getMaskedMemoryOpCost( + unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, + bool VariableMask, Align Alignment, + TTI::TargetCostKind CostKind, + const Instruction *I); int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr); unsigned getAtomicMemIntrinsicMaxElementSize() const; - int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Type *> Tys, FastMathFlags FMF, - unsigned ScalarizationCostPassed = UINT_MAX); - int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, - ArrayRef<Value *> Args, FastMathFlags FMF, - unsigned VF = 1); - - int getArithmeticReductionCost(unsigned Opcode, Type *Ty, - bool IsPairwiseForm); - - int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, - bool IsUnsigned); - - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace, - bool UseMaskForCond = false, - bool UseMaskForGaps = false); - int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace, - bool UseMaskForCond = false, - bool UseMaskForGaps = false); - int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace, - bool UseMaskForCond = false, - bool UseMaskForGaps = false); + int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind); + + int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); + + int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned); + + int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwiseForm, bool IsUnsigned, + TTI::TargetCostKind CostKind); + + int getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, + Align Alignment, unsigned AddressSpace, + 
+      TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+      bool UseMaskForCond = false, bool UseMaskForGaps = false);
+  int getInterleavedMemoryOpCostAVX512(
+      unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+      ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+      bool UseMaskForCond = false, bool UseMaskForGaps = false);
+  int getInterleavedMemoryOpCostAVX2(
+      unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+      ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+      bool UseMaskForCond = false, bool UseMaskForGaps = false);
 
   int getIntImmCost(int64_t);
 
-  int getIntImmCost(const APInt &Imm, Type *Ty);
+  int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
 
-  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
 
-  int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+  int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
+                        TTI::TargetCostKind CostKind);
   int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
-                          Type *Ty);
+                          Type *Ty, TTI::TargetCostKind CostKind);
   bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                      TargetTransformInfo::LSRCost &C2);
   bool canMacroFuseCmp();
-  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment);
-  bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
+  bool isLegalMaskedLoad(Type *DataType, Align Alignment);
+  bool isLegalMaskedStore(Type *DataType, Align Alignment);
   bool isLegalNTLoad(Type *DataType, Align Alignment);
   bool isLegalNTStore(Type *DataType, Align Alignment);
-  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment);
-  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment);
+  bool isLegalMaskedGather(Type *DataType, Align Alignment);
+  bool isLegalMaskedScatter(Type *DataType, Align Alignment);
   bool isLegalMaskedExpandLoad(Type *DataType);
   bool isLegalMaskedCompressStore(Type *DataType);
   bool hasDivRemOp(Type *DataType, bool IsSigned);
@@ -203,11 +215,20 @@ public:
   TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                     bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
+
+  /// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
+  /// into shuffles and vector math/logic by the backend
+  /// (see TTI::shouldExpandReduction)
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             TTI::ReductionFlags Flags) const {
+    return true;
+  }
+
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
-                      unsigned Alignment, unsigned AddressSpace);
-  int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
-                      unsigned Alignment, unsigned AddressSpace);
+                      Align Alignment, unsigned AddressSpace);
+  int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+                      Align Alignment, unsigned AddressSpace);
 
   /// @}
 };
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 7a8308ef1ba9..c188c7443625 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -39,6 +39,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "x86-vzeroupper"
 
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::init(true));
+
 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
 
 namespace {
@@ -278,6 +283,9 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 /// Loop over all of the basic blocks, inserting vzeroupper instructions before
 /// function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  if (!UseVZeroUpper)
+    return false;
+
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   if (!ST.hasAVX() || !ST.insertVZEROUPPER())
     return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index 42e8fba2201e..72593afb2258 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -19,6 +19,7 @@
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
index 78d3f6460189..8627bbbf18d2 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -19,7 +19,7 @@
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -67,13 +67,13 @@ private:
 
   Function *generateLSDAInEAXThunk(Function *ParentFunc);
 
-  bool isStateStoreNeeded(EHPersonality Personality, CallSite CS);
-  void rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, CallSite CS,
-                             Value *State);
+  bool isStateStoreNeeded(EHPersonality Personality, CallBase &Call);
+  void rewriteSetJmpCall(IRBuilder<> &Builder, Function &F, CallBase &Call,
+                         Value *State);
   int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
                         WinEHFuncInfo &FuncInfo, BasicBlock *BB);
-  int getStateForCallSite(DenseMap<BasicBlock *, ColorVector> &BlockColors,
-                          WinEHFuncInfo &FuncInfo, CallSite CS);
+  int getStateForCall(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+                      WinEHFuncInfo &FuncInfo, CallBase &Call);
 
   // Module-level type getters.
   Type *getEHLinkRegistrationType();
@@ -455,16 +455,14 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
 // The idea behind _setjmp3 is that it takes an optional number of personality
 // specific parameters to indicate how to restore the personality-specific frame
 // state when longjmp is initiated. Typically, the current TryLevel is saved.
-void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
-                                           CallSite CS, Value *State) {
+void WinEHStatePass::rewriteSetJmpCall(IRBuilder<> &Builder, Function &F,
+                                       CallBase &Call, Value *State) {
   // Don't rewrite calls with a weird number of arguments.
-  if (CS.getNumArgOperands() != 2)
+  if (Call.getNumArgOperands() != 2)
     return;
 
-  Instruction *Inst = CS.getInstruction();
-
   SmallVector<OperandBundleDef, 1> OpBundles;
-  CS.getOperandBundlesAsDefs(OpBundles);
+  Call.getOperandBundlesAsDefs(OpBundles);
 
   SmallVector<Value *, 3> OptionalArgs;
   if (Personality == EHPersonality::MSVC_CXX) {
@@ -482,29 +480,27 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
 
   SmallVector<Value *, 5> Args;
   Args.push_back(
-      Builder.CreateBitCast(CS.getArgOperand(0), Builder.getInt8PtrTy()));
+      Builder.CreateBitCast(Call.getArgOperand(0), Builder.getInt8PtrTy()));
   Args.push_back(Builder.getInt32(OptionalArgs.size()));
   Args.append(OptionalArgs.begin(), OptionalArgs.end());
 
-  CallSite NewCS;
-  if (CS.isCall()) {
-    auto *CI = cast<CallInst>(Inst);
+  CallBase *NewCall;
+  if (auto *CI = dyn_cast<CallInst>(&Call)) {
     CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
     NewCI->setTailCallKind(CI->getTailCallKind());
-    NewCS = NewCI;
+    NewCall = NewCI;
   } else {
-    auto *II = cast<InvokeInst>(Inst);
-    NewCS = Builder.CreateInvoke(
+    auto *II = cast<InvokeInst>(&Call);
+    NewCall = Builder.CreateInvoke(
         SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
   }
-  NewCS.setCallingConv(CS.getCallingConv());
-  NewCS.setAttributes(CS.getAttributes());
-  NewCS->setDebugLoc(CS->getDebugLoc());
-
-  Instruction *NewInst = NewCS.getInstruction();
-  NewInst->takeName(Inst);
-  Inst->replaceAllUsesWith(NewInst);
-  Inst->eraseFromParent();
+  NewCall->setCallingConv(Call.getCallingConv());
+  NewCall->setAttributes(Call.getAttributes());
+  NewCall->setDebugLoc(Call.getDebugLoc());
+
+  NewCall->takeName(&Call);
+  Call.replaceAllUsesWith(NewCall);
+  Call.eraseFromParent();
 }
 
 // Figure out what state we should assign calls in this block.
@@ -527,17 +523,17 @@ int WinEHStatePass::getBaseStateForBB(
 }
 
 // Calculate the state a call-site is in.
-int WinEHStatePass::getStateForCallSite(
+int WinEHStatePass::getStateForCall(
     DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
-    CallSite CS) {
-  if (auto *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+    CallBase &Call) {
+  if (auto *II = dyn_cast<InvokeInst>(&Call)) {
     // Look up the state number of the EH pad this unwinds to.
     assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
     return FuncInfo.InvokeStateMap[II];
   }
   // Possibly throwing call instructions have no actions to take after
   // an unwind. Ensure they are in the -1 state.
-  return getBaseStateForBB(BlockColors, FuncInfo, CS.getParent());
+  return getBaseStateForBB(BlockColors, FuncInfo, Call.getParent());
 }
 
 // Calculate the intersection of all the FinalStates for a BasicBlock's
@@ -618,16 +614,13 @@ static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
 }
 
 bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
-                                        CallSite CS) {
-  if (!CS)
-    return false;
-
+                                        CallBase &Call) {
   // If the function touches memory, it needs a state store.
   if (isAsynchronousEHPersonality(Personality))
-    return !CS.doesNotAccessMemory();
+    return !Call.doesNotAccessMemory();
 
   // If the function throws, it needs a state store.
-  return !CS.doesNotThrow();
+  return !Call.doesNotThrow();
 }
@@ -672,11 +665,11 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
     if (&F.getEntryBlock() == BB)
       InitialState = FinalState = ParentBaseState;
     for (Instruction &I : *BB) {
-      CallSite CS(&I);
-      if (!isStateStoreNeeded(Personality, CS))
+      auto *Call = dyn_cast<CallBase>(&I);
+      if (!Call || !isStateStoreNeeded(Personality, *Call))
         continue;
 
-      int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+      int State = getStateForCall(BlockColors, FuncInfo, *Call);
       if (InitialState == OverdefinedState)
         InitialState = State;
       FinalState = State;
@@ -739,11 +732,11 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
                       << " PrevState=" << PrevState << '\n');
 
     for (Instruction &I : *BB) {
-      CallSite CS(&I);
-      if (!isStateStoreNeeded(Personality, CS))
+      auto *Call = dyn_cast<CallBase>(&I);
+      if (!Call || !isStateStoreNeeded(Personality, *Call))
         continue;
 
-      int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+      int State = getStateForCall(BlockColors, FuncInfo, *Call);
       if (State != PrevState)
         insertStateNumberStore(&I, State);
       PrevState = State;
@@ -756,35 +749,35 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
         insertStateNumberStore(BB->getTerminator(), EndState->second);
   }
 
-  SmallVector<CallSite, 1> SetJmp3CallSites;
+  SmallVector<CallBase *, 1> SetJmp3Calls;
   for (BasicBlock *BB : RPOT) {
     for (Instruction &I : *BB) {
-      CallSite CS(&I);
-      if (!CS)
+      auto *Call = dyn_cast<CallBase>(&I);
+      if (!Call)
         continue;
-      if (CS.getCalledValue()->stripPointerCasts() !=
+      if (Call->getCalledOperand()->stripPointerCasts() !=
           SetJmp3.getCallee()->stripPointerCasts())
         continue;
 
-      SetJmp3CallSites.push_back(CS);
+      SetJmp3Calls.push_back(Call);
     }
   }
 
-  for (CallSite CS : SetJmp3CallSites) {
-    auto &BBColors = BlockColors[CS->getParent()];
+  for (CallBase *Call : SetJmp3Calls) {
+    auto &BBColors = BlockColors[Call->getParent()];
     BasicBlock *FuncletEntryBB = BBColors.front();
     bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
 
-    IRBuilder<> Builder(CS.getInstruction());
+    IRBuilder<> Builder(Call);
     Value *State;
     if (InCleanup) {
       Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
                                                   RegNode, StateFieldIndex);
       State = Builder.CreateLoad(Builder.getInt32Ty(), StateField);
    } else {
-      State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
+      State = Builder.getInt32(getStateForCall(BlockColors, FuncInfo, *Call));
    }
-    rewriteSetJmpCallSite(Builder, F, CS, State);
+    rewriteSetJmpCall(Builder, F, *Call, State);
   }
 }
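Note on the X86WinEHState.cpp hunks above: they track the LLVM-wide removal of the CallSite wrapper in favor of working with CallBase directly. The sketch below is not part of the patch; it is a minimal illustration, assuming LLVM 11-era headers, of the dyn_cast<CallBase> idiom the rewritten pass uses, and the helper names (mayNeedStateStore, countStateStoreCalls) are invented here for the example.

  // Illustrative only: mirrors the shape of the rewritten isStateStoreNeeded()
  // logic and the per-instruction scan in addStateStores(), not the pass itself.
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/InstrTypes.h"   // llvm::CallBase
  #include "llvm/IR/Instruction.h"
  #include "llvm/Support/Casting.h" // llvm::dyn_cast

  using namespace llvm;

  // A call needs a state store if it may throw, or, under an asynchronous
  // EH personality, if it may touch memory.
  static bool mayNeedStateStore(const CallBase &Call, bool AsyncPersonality) {
    if (AsyncPersonality)
      return !Call.doesNotAccessMemory();
    return !Call.doesNotThrow();
  }

  static unsigned countStateStoreCalls(const BasicBlock &BB,
                                       bool AsyncPersonality) {
    unsigned N = 0;
    for (const Instruction &I : BB) {
      // The old code constructed CallSite(&I) and tested it for validity;
      // the new idiom is a dyn_cast to CallBase, which covers both
      // CallInst and InvokeInst in one check.
      const auto *Call = dyn_cast<CallBase>(&I);
      if (Call && mayNeedStateStore(*Call, AsyncPersonality))
        ++N;
    }
    return N;
  }

In this style the null check on the dyn_cast result replaces the old "if (!CS)" validity test, which is why the patched isStateStoreNeeded() no longer carries that guard and the callers filter out non-call instructions themselves.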